//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation  ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64TargetLowering class.
//
//===----------------------------------------------------------------------===//

#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64ExpandImm.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ComplexDeinterleavingPass.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/TargetParser/Triple.h"
#include <algorithm>
#include <bitset>
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <optional>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64-lower"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");

// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
    "aarch64-elf-ldtls-generation", cl::Hidden,
    cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
    cl::init(false));

static cl::opt<bool>
EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
                         cl::desc("Enable AArch64 logical imm instruction "
                                  "optimization"),
                         cl::init(true));

// Temporary option added to test functionality added to DAGCombiner.cpp in
// D92230. It is expected that this can be removed in the future once both
// implementations are based on MGATHER rather than the GLD1 nodes added for
// the SVE gather load intrinsics.
static cl::opt<bool>
EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
                                cl::desc("Combine extends of AArch64 masked "
                                         "gather intrinsics"),
                                cl::init(true));

static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
                                    cl::desc("Combine ext and trunc to TBL"),
                                    cl::init(true));

// The XOR, OR and CMP operations all use ALU ports, and the resulting data
// dependencies become the bottleneck after this transform on high-end CPUs.
// This maximum leaf-node limit guards against forming cmp+ccmp chains that
// would not be profitable.
static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
                                 cl::desc("Maximum of xors"));

/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;

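// Registers used to pass the first arguments under the AAPCS64 calling
// convention: X0-X7 for general-purpose arguments and Q0-Q7 (the full 128-bit
// views of V0-V7) for floating-point/SIMD arguments.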
static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
                                       AArch64::X3, AArch64::X4, AArch64::X5,
                                       AArch64::X6, AArch64::X7};
static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
                                       AArch64::Q3, AArch64::Q4, AArch64::Q5,
                                       AArch64::Q6, AArch64::Q7};

ArrayRef<MCPhysReg> llvm::AArch64::getGPRArgRegs() { return GPRArgRegs; }

ArrayRef<MCPhysReg> llvm::AArch64::getFPRArgRegs() { return FPRArgRegs; }

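/// Returns the packed (fully utilised) SVE vector type for the given element
/// type, e.g. i16 -> nxv8i16 and f32 -> nxv4f32.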
static inline EVT getPackedSVEVectorVT(EVT VT) {
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unexpected element type for vector");
  case MVT::i8:
    return MVT::nxv16i8;
  case MVT::i16:
    return MVT::nxv8i16;
  case MVT::i32:
    return MVT::nxv4i32;
  case MVT::i64:
    return MVT::nxv2i64;
  case MVT::f16:
    return MVT::nxv8f16;
  case MVT::f32:
    return MVT::nxv4f32;
  case MVT::f64:
    return MVT::nxv2f64;
  case MVT::bf16:
    return MVT::nxv8bf16;
  }
}

// NOTE: Currently there's only a need to return integer vector types. If this
// changes then just add an extra "type" parameter.
static inline EVT getPackedSVEVectorVT(ElementCount EC) {
  switch (EC.getKnownMinValue()) {
  default:
    llvm_unreachable("unexpected element count for vector");
  case 16:
    return MVT::nxv16i8;
  case 8:
    return MVT::nxv8i16;
  case 4:
    return MVT::nxv4i32;
  case 2:
    return MVT::nxv2i64;
  }
}

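/// Returns the packed integer vector type with the same element count as the
/// given scalable predicate type, e.g. nxv4i1 -> nxv4i32.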
static inline EVT getPromotedVTForPredicate(EVT VT) {
  assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
         "Expected scalable predicate vector type!");
  switch (VT.getVectorMinNumElements()) {
  default:
    llvm_unreachable("unexpected element count for vector");
  case 2:
    return MVT::nxv2i64;
  case 4:
    return MVT::nxv4i32;
  case 8:
    return MVT::nxv8i16;
  case 16:
    return MVT::nxv16i8;
  }
}

/// Returns true if VT's elements occupy the lowest bit positions of its
/// associated register class without any intervening space.
///
/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
/// same register class, but only nxv8f16 can be treated as a packed vector.
static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
  assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
         "Expected legal vector type!");
  return VT.isFixedLengthVector() ||
         VT.getSizeInBits().getKnownMinValue() == AArch64::SVEBitsPerBlock;
}

// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
// predicate and end with a passthru value matching the result type.
static bool isMergePassthruOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
  case AArch64ISD::BSWAP_MERGE_PASSTHRU:
  case AArch64ISD::REVH_MERGE_PASSTHRU:
  case AArch64ISD::REVW_MERGE_PASSTHRU:
  case AArch64ISD::REVD_MERGE_PASSTHRU:
  case AArch64ISD::CTLZ_MERGE_PASSTHRU:
  case AArch64ISD::CTPOP_MERGE_PASSTHRU:
  case AArch64ISD::DUP_MERGE_PASSTHRU:
  case AArch64ISD::ABS_MERGE_PASSTHRU:
  case AArch64ISD::NEG_MERGE_PASSTHRU:
  case AArch64ISD::FNEG_MERGE_PASSTHRU:
  case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
  case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
  case AArch64ISD::FCEIL_MERGE_PASSTHRU:
  case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
  case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
  case AArch64ISD::FRINT_MERGE_PASSTHRU:
  case AArch64ISD::FROUND_MERGE_PASSTHRU:
  case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
  case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
  case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
  case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
  case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
  case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
  case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
  case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
  case AArch64ISD::FSQRT_MERGE_PASSTHRU:
  case AArch64ISD::FRECPX_MERGE_PASSTHRU:
  case AArch64ISD::FABS_MERGE_PASSTHRU:
    return true;
  }
}

// Returns true if inactive lanes are known to be zeroed by construction.
static bool isZeroingInactiveLanes(SDValue Op) {
  switch (Op.getOpcode()) {
  default:
    // We guarantee i1 splat_vectors to zero the other lanes by
    // implementing it with ptrue and possibly a punpklo for nxv1i1.
    if (ISD::isConstantSplatVectorAllOnes(Op.getNode()))
      return true;
    return false;
  case AArch64ISD::PTRUE:
  case AArch64ISD::SETCC_MERGE_ZERO:
    return true;
  case ISD::INTRINSIC_WO_CHAIN:
    switch (Op.getConstantOperandVal(0)) {
    default:
      return false;
    case Intrinsic::aarch64_sve_ptrue:
    case Intrinsic::aarch64_sve_pnext:
    case Intrinsic::aarch64_sve_cmpeq:
    case Intrinsic::aarch64_sve_cmpne:
    case Intrinsic::aarch64_sve_cmpge:
    case Intrinsic::aarch64_sve_cmpgt:
    case Intrinsic::aarch64_sve_cmphs:
    case Intrinsic::aarch64_sve_cmphi:
    case Intrinsic::aarch64_sve_cmpeq_wide:
    case Intrinsic::aarch64_sve_cmpne_wide:
    case Intrinsic::aarch64_sve_cmpge_wide:
    case Intrinsic::aarch64_sve_cmpgt_wide:
    case Intrinsic::aarch64_sve_cmplt_wide:
    case Intrinsic::aarch64_sve_cmple_wide:
    case Intrinsic::aarch64_sve_cmphs_wide:
    case Intrinsic::aarch64_sve_cmphi_wide:
    case Intrinsic::aarch64_sve_cmplo_wide:
    case Intrinsic::aarch64_sve_cmpls_wide:
    case Intrinsic::aarch64_sve_fcmpeq:
    case Intrinsic::aarch64_sve_fcmpne:
    case Intrinsic::aarch64_sve_fcmpge:
    case Intrinsic::aarch64_sve_fcmpgt:
    case Intrinsic::aarch64_sve_fcmpuo:
    case Intrinsic::aarch64_sve_facgt:
    case Intrinsic::aarch64_sve_facge:
    case Intrinsic::aarch64_sve_whilege:
    case Intrinsic::aarch64_sve_whilegt:
    case Intrinsic::aarch64_sve_whilehi:
    case Intrinsic::aarch64_sve_whilehs:
    case Intrinsic::aarch64_sve_whilele:
    case Intrinsic::aarch64_sve_whilelo:
    case Intrinsic::aarch64_sve_whilels:
    case Intrinsic::aarch64_sve_whilelt:
    case Intrinsic::aarch64_sve_match:
    case Intrinsic::aarch64_sve_nmatch:
    case Intrinsic::aarch64_sve_whilege_x2:
    case Intrinsic::aarch64_sve_whilegt_x2:
    case Intrinsic::aarch64_sve_whilehi_x2:
    case Intrinsic::aarch64_sve_whilehs_x2:
    case Intrinsic::aarch64_sve_whilele_x2:
    case Intrinsic::aarch64_sve_whilelo_x2:
    case Intrinsic::aarch64_sve_whilels_x2:
    case Intrinsic::aarch64_sve_whilelt_x2:
      return true;
    }
  }
}

AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                                             const AArch64Subtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
  // we have to make something up. Arbitrarily, choose ZeroOrOne.
  setBooleanContents(ZeroOrOneBooleanContent);
  // When comparing vectors the result sets the different elements in the
  // vector to all-one or all-zero.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Set up the register classes.
  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);

  if (Subtarget->hasLS64()) {
    addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
    setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
    setOperationAction(ISD::STORE, MVT::i64x8, Custom);
  }

  if (Subtarget->hasFPARMv8()) {
    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
  }

  if (Subtarget->hasNEON()) {
    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
    // Someone set us up the NEON.
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);
    addDRTypeForNEON(MVT::v1f64);
    addDRTypeForNEON(MVT::v4f16);
    if (Subtarget->hasBF16())
      addDRTypeForNEON(MVT::v4bf16);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);
    addQRTypeForNEON(MVT::v8f16);
    if (Subtarget->hasBF16())
      addQRTypeForNEON(MVT::v8bf16);
  }

  if (Subtarget->hasSVEorSME()) {
    // Add legal sve predicate types
    addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);

    // Add legal sve data types
    addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);

    addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);

    if (Subtarget->hasBF16()) {
      addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
      addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
      addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
    }

    if (Subtarget->useSVEForFixedLengthVectors()) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addRegisterClass(VT, &AArch64::ZPRRegClass);

      for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addRegisterClass(VT, &AArch64::ZPRRegClass);
    }
  }

  if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
    addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
    setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
    setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);

    setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
    setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
  }

  // Compute derived properties from the register classes
  computeRegisterProperties(Subtarget->getRegisterInfo());

  // Provide all sorts of operation actions
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::f16, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
  setOperationAction(ISD::BR_CC, MVT::f16, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f16, Custom);
  setOperationAction(ISD::SELECT, MVT::bf16, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::bf16, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Custom);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);
  setOperationAction(ISD::SETCCCARRY, MVT::i64, Custom);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);

  setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);

  // Custom lowering hooks are needed for XOR
  // to fold it into CSINC/CSINV.
  setOperationAction(ISD::XOR, MVT::i32, Custom);
  setOperationAction(ISD::XOR, MVT::i64, Custom);

  // Virtually no operation on f128 is legal, but LLVM can't expand them when
  // there's a valid register class, so we need custom operations in most cases.
  setOperationAction(ISD::FABS, MVT::f128, Expand);
  setOperationAction(ISD::FADD, MVT::f128, LibCall);
  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
  setOperationAction(ISD::FCOS, MVT::f128, Expand);
  setOperationAction(ISD::FDIV, MVT::f128, LibCall);
  setOperationAction(ISD::FMA, MVT::f128, Expand);
  setOperationAction(ISD::FMUL, MVT::f128, LibCall);
  setOperationAction(ISD::FNEG, MVT::f128, Expand);
  setOperationAction(ISD::FPOW, MVT::f128, Expand);
  setOperationAction(ISD::FREM, MVT::f128, Expand);
  setOperationAction(ISD::FRINT, MVT::f128, Expand);
  setOperationAction(ISD::FSIN, MVT::f128, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
  setOperationAction(ISD::FSUB, MVT::f128, LibCall);
  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
  setOperationAction(ISD::SETCC, MVT::f128, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
  setOperationAction(ISD::SELECT, MVT::f128, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
  // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
  // aren't handled.

  // Lowering for many of the conversions is actually specified by the non-f128
  // type. The LowerXXX function will be trivial when f128 isn't involved.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);

  setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);

  // Variable arguments.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);

  // Variable-sized objects.
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  // Lowering Funnel Shifts to EXTR
  setOperationAction(ISD::FSHR, MVT::i32, Custom);
  setOperationAction(ISD::FSHR, MVT::i64, Custom);
  setOperationAction(ISD::FSHL, MVT::i32, Custom);
  setOperationAction(ISD::FSHL, MVT::i64, Custom);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);

  // Constant pool entries
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);

  // BlockAddress
  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);

  // AArch64 lacks both left-rotate and popcount instructions.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }

  // AArch64 doesn't have i32 MULH{S|U}.
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);

  // AArch64 doesn't have {U|S}MUL_LOHI.
  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

  if (Subtarget->hasCSSC()) {
    setOperationAction(ISD::CTPOP, MVT::i32, Legal);
    setOperationAction(ISD::CTPOP, MVT::i64, Legal);
    setOperationAction(ISD::CTPOP, MVT::i128, Expand);

    setOperationAction(ISD::PARITY, MVT::i128, Expand);

    setOperationAction(ISD::CTTZ, MVT::i32, Legal);
    setOperationAction(ISD::CTTZ, MVT::i64, Legal);
    setOperationAction(ISD::CTTZ, MVT::i128, Expand);

    setOperationAction(ISD::ABS, MVT::i32, Legal);
    setOperationAction(ISD::ABS, MVT::i64, Legal);

    setOperationAction(ISD::SMAX, MVT::i32, Legal);
    setOperationAction(ISD::SMAX, MVT::i64, Legal);
    setOperationAction(ISD::UMAX, MVT::i32, Legal);
    setOperationAction(ISD::UMAX, MVT::i64, Legal);

    setOperationAction(ISD::SMIN, MVT::i32, Legal);
    setOperationAction(ISD::SMIN, MVT::i64, Legal);
    setOperationAction(ISD::UMIN, MVT::i32, Legal);
    setOperationAction(ISD::UMIN, MVT::i64, Legal);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::i64, Custom);
    setOperationAction(ISD::CTPOP, MVT::i128, Custom);

    setOperationAction(ISD::PARITY, MVT::i64, Custom);
    setOperationAction(ISD::PARITY, MVT::i128, Custom);

    setOperationAction(ISD::ABS, MVT::i32, Custom);
    setOperationAction(ISD::ABS, MVT::i64, Custom);
  }

  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
  }
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  // Custom lower Add/Sub/Mul with overflow.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);
  setOperationAction(ISD::UMULO, MVT::i32, Custom);
  setOperationAction(ISD::UMULO, MVT::i64, Custom);

  setOperationAction(ISD::UADDO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::UADDO_CARRY, MVT::i64, Custom);
  setOperationAction(ISD::USUBO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::USUBO_CARRY, MVT::i64, Custom);
  setOperationAction(ISD::SADDO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::SADDO_CARRY, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO_CARRY, MVT::i64, Custom);

  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  if (Subtarget->hasFullFP16())
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
  else
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);

  for (auto Op : {ISD::FREM,        ISD::FPOW,         ISD::FPOWI,
                  ISD::FCOS,        ISD::FSIN,         ISD::FSINCOS,
                  ISD::FEXP,        ISD::FEXP2,        ISD::FEXP10,
                  ISD::FLOG,        ISD::FLOG2,        ISD::FLOG10,
                  ISD::STRICT_FREM,
                  ISD::STRICT_FPOW, ISD::STRICT_FPOWI, ISD::STRICT_FCOS,
                  ISD::STRICT_FSIN, ISD::STRICT_FEXP,  ISD::STRICT_FEXP2,
                  ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) {
    setOperationAction(Op, MVT::f16, Promote);
    setOperationAction(Op, MVT::v4f16, Expand);
    setOperationAction(Op, MVT::v8f16, Expand);
  }

  if (!Subtarget->hasFullFP16()) {
    for (auto Op :
         {ISD::SETCC,          ISD::SELECT_CC,
          ISD::BR_CC,          ISD::FADD,           ISD::FSUB,
          ISD::FMUL,           ISD::FDIV,           ISD::FMA,
          ISD::FNEG,           ISD::FABS,           ISD::FCEIL,
          ISD::FSQRT,          ISD::FFLOOR,         ISD::FNEARBYINT,
          ISD::FRINT,          ISD::FROUND,         ISD::FROUNDEVEN,
          ISD::FTRUNC,         ISD::FMINNUM,        ISD::FMAXNUM,
          ISD::FMINIMUM,       ISD::FMAXIMUM,       ISD::STRICT_FADD,
          ISD::STRICT_FSUB,    ISD::STRICT_FMUL,    ISD::STRICT_FDIV,
          ISD::STRICT_FMA,     ISD::STRICT_FCEIL,   ISD::STRICT_FFLOOR,
          ISD::STRICT_FSQRT,   ISD::STRICT_FRINT,   ISD::STRICT_FNEARBYINT,
          ISD::STRICT_FROUND,  ISD::STRICT_FTRUNC,  ISD::STRICT_FROUNDEVEN,
          ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
          ISD::STRICT_FMAXIMUM})
      setOperationAction(Op, MVT::f16, Promote);

    // Round-to-integer operations need custom lowering for fp16, as Promote
    // doesn't work because the result type is integer.
    for (auto Op : {ISD::LROUND, ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
                    ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT,
                    ISD::STRICT_LLRINT})
      setOperationAction(Op, MVT::f16, Custom);

    // Promote v4f16 to v4f32 when that is known to be safe.
    setOperationPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
    setOperationPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
    setOperationPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
    setOperationPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);

    setOperationAction(ISD::FABS,        MVT::v4f16, Expand);
    setOperationAction(ISD::FNEG,        MVT::v4f16, Expand);
    setOperationAction(ISD::FROUND,      MVT::v4f16, Expand);
    setOperationAction(ISD::FROUNDEVEN,  MVT::v4f16, Expand);
    setOperationAction(ISD::FMA,         MVT::v4f16, Expand);
    setOperationAction(ISD::SETCC,       MVT::v4f16, Custom);
    setOperationAction(ISD::BR_CC,       MVT::v4f16, Expand);
    setOperationAction(ISD::SELECT,      MVT::v4f16, Expand);
    setOperationAction(ISD::SELECT_CC,   MVT::v4f16, Expand);
    setOperationAction(ISD::FTRUNC,      MVT::v4f16, Expand);
    setOperationAction(ISD::FCOPYSIGN,   MVT::v4f16, Expand);
    setOperationAction(ISD::FFLOOR,      MVT::v4f16, Expand);
    setOperationAction(ISD::FCEIL,       MVT::v4f16, Expand);
    setOperationAction(ISD::FRINT,       MVT::v4f16, Expand);
    setOperationAction(ISD::FNEARBYINT,  MVT::v4f16, Expand);
    setOperationAction(ISD::FSQRT,       MVT::v4f16, Expand);

    setOperationAction(ISD::FABS,        MVT::v8f16, Expand);
    setOperationAction(ISD::FADD,        MVT::v8f16, Expand);
    setOperationAction(ISD::FCEIL,       MVT::v8f16, Expand);
    setOperationAction(ISD::FCOPYSIGN,   MVT::v8f16, Expand);
    setOperationAction(ISD::FDIV,        MVT::v8f16, Expand);
    setOperationAction(ISD::FFLOOR,      MVT::v8f16, Expand);
    setOperationAction(ISD::FMA,         MVT::v8f16, Expand);
    setOperationAction(ISD::FMUL,        MVT::v8f16, Expand);
    setOperationAction(ISD::FNEARBYINT,  MVT::v8f16, Expand);
    setOperationAction(ISD::FNEG,        MVT::v8f16, Expand);
    setOperationAction(ISD::FROUND,      MVT::v8f16, Expand);
    setOperationAction(ISD::FROUNDEVEN,  MVT::v8f16, Expand);
    setOperationAction(ISD::FRINT,       MVT::v8f16, Expand);
    setOperationAction(ISD::FSQRT,       MVT::v8f16, Expand);
    setOperationAction(ISD::FSUB,        MVT::v8f16, Expand);
    setOperationAction(ISD::FTRUNC,      MVT::v8f16, Expand);
    setOperationAction(ISD::SETCC,       MVT::v8f16, Expand);
    setOperationAction(ISD::BR_CC,       MVT::v8f16, Expand);
    setOperationAction(ISD::SELECT,      MVT::v8f16, Expand);
    setOperationAction(ISD::SELECT_CC,   MVT::v8f16, Expand);
    setOperationAction(ISD::FP_EXTEND,   MVT::v8f16, Expand);
  }

  // AArch64 has implementations of a lot of rounding-like FP operations.
  for (auto Op :
       {ISD::FFLOOR,          ISD::FNEARBYINT,      ISD::FCEIL,
        ISD::FRINT,           ISD::FTRUNC,          ISD::FROUND,
        ISD::FROUNDEVEN,      ISD::FMINNUM,         ISD::FMAXNUM,
        ISD::FMINIMUM,        ISD::FMAXIMUM,        ISD::LROUND,
        ISD::LLROUND,         ISD::LRINT,           ISD::LLRINT,
        ISD::STRICT_FFLOOR,   ISD::STRICT_FCEIL,    ISD::STRICT_FNEARBYINT,
        ISD::STRICT_FRINT,    ISD::STRICT_FTRUNC,   ISD::STRICT_FROUNDEVEN,
        ISD::STRICT_FROUND,   ISD::STRICT_FMINNUM,  ISD::STRICT_FMAXNUM,
        ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_LROUND,
        ISD::STRICT_LLROUND,  ISD::STRICT_LRINT,    ISD::STRICT_LLRINT}) {
    for (MVT Ty : {MVT::f32, MVT::f64})
      setOperationAction(Op, Ty, Legal);
    if (Subtarget->hasFullFP16())
      setOperationAction(Op, MVT::f16, Legal);
  }

  // Basic strict FP operations are legal
  for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
                  ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) {
    for (MVT Ty : {MVT::f32, MVT::f64})
      setOperationAction(Op, Ty, Legal);
    if (Subtarget->hasFullFP16())
      setOperationAction(Op, MVT::f16, Legal);
  }

  // Strict conversion to a larger type is legal
  for (auto VT : {MVT::f32, MVT::f64})
    setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
  setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);

  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
  if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, LibCall);
  } else {
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Expand);
  }
  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);

  // Generate outline atomics library calls only if LSE was not specified for
  // the subtarget.
  if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
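    // The macros below register the outlined atomic helper names for every
    // access size and memory ordering. For example,
    // LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp) maps the 1/2/4/8
    // byte SWP libcalls to "__aarch64_swp1_relax" .. "__aarch64_swp8_acq_rel".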
#define LCALLNAMES(A, B, N)                                                    \
  setLibcallName(A##N##_RELAX, #B #N "_relax");                                \
  setLibcallName(A##N##_ACQ, #B #N "_acq");                                    \
  setLibcallName(A##N##_REL, #B #N "_rel");                                    \
  setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
#define LCALLNAME4(A, B)                                                       \
  LCALLNAMES(A, B, 1)                                                          \
  LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
#define LCALLNAME5(A, B)                                                       \
  LCALLNAMES(A, B, 1)                                                          \
  LCALLNAMES(A, B, 2)                                                          \
  LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
    LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
#undef LCALLNAMES
#undef LCALLNAME4
#undef LCALLNAME5
  }

  if (Subtarget->hasLSE128()) {
    // Custom lowering because i128 is not legal. Must be replaced by 2x64
    // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i128, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i128, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i128, Custom);
  }

  // 128-bit loads and stores can be done without expanding
  setOperationAction(ISD::LOAD, MVT::i128, Custom);
  setOperationAction(ISD::STORE, MVT::i128, Custom);

  // Aligned 128-bit loads and stores are single-copy atomic according to the
  // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
  if (Subtarget->hasLSE2()) {
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
    setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
  }

  // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
  // custom lowering, as there are no un-paired non-temporal stores and
  // legalization will break up 256 bit inputs.
  setOperationAction(ISD::STORE, MVT::v32i8, Custom);
  setOperationAction(ISD::STORE, MVT::v16i16, Custom);
  setOperationAction(ISD::STORE, MVT::v16f16, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8f32, Custom);
  setOperationAction(ISD::STORE, MVT::v4f64, Custom);
  setOperationAction(ISD::STORE, MVT::v4i64, Custom);

  // 256 bit non-temporal loads can be lowered to LDNP. This is done using
  // custom lowering, as there are no un-paired non-temporal loads and
  // legalization will break up 256 bit inputs.
  setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
  setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i64, Custom);

  // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
    // Issue __sincos_stret if available.
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  } else {
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  }

  if (Subtarget->getTargetTriple().isOSMSVCRT()) {
    // MSVCRT doesn't have powi; fall back to pow
    setLibcallName(RTLIB::POWI_F32, nullptr);
    setLibcallName(RTLIB::POWI_F64, nullptr);
  }

  // Make floating-point constants legal for the large code model, so they don't
  // become loads from the constant pool.
  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  }

  // AArch64 does not have floating-point extending loads, i1 sign-extending
  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
  }
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);

  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f128, MVT::f80, Expand);
  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
  setTruncStoreAction(MVT::f128, MVT::f16, Expand);

  setOperationAction(ISD::BITCAST, MVT::i16, Custom);
  setOperationAction(ISD::BITCAST, MVT::f16, Custom);
  setOperationAction(ISD::BITCAST, MVT::bf16, Custom);

  // Indexed loads and stores are supported.
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    setIndexedLoadAction(im, MVT::i8, Legal);
    setIndexedLoadAction(im, MVT::i16, Legal);
    setIndexedLoadAction(im, MVT::i32, Legal);
    setIndexedLoadAction(im, MVT::i64, Legal);
    setIndexedLoadAction(im, MVT::f64, Legal);
    setIndexedLoadAction(im, MVT::f32, Legal);
    setIndexedLoadAction(im, MVT::f16, Legal);
    setIndexedLoadAction(im, MVT::bf16, Legal);
    setIndexedStoreAction(im, MVT::i8, Legal);
    setIndexedStoreAction(im, MVT::i16, Legal);
    setIndexedStoreAction(im, MVT::i32, Legal);
    setIndexedStoreAction(im, MVT::i64, Legal);
    setIndexedStoreAction(im, MVT::f64, Legal);
    setIndexedStoreAction(im, MVT::f32, Legal);
    setIndexedStoreAction(im, MVT::f16, Legal);
    setIndexedStoreAction(im, MVT::bf16, Legal);
  }

  // Trap.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
  setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);

  // We combine OR nodes for bitfield operations.
  setTargetDAGCombine(ISD::OR);
  // Try to create BICs for vector ANDs.
  setTargetDAGCombine(ISD::AND);

  // Vector add and sub nodes may conceal a high-half opportunity.
  // Also, try to fold ADD into CSINC/CSINV.
  setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP,
                       ISD::UINT_TO_FP});

  setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
                       ISD::FP_TO_UINT_SAT, ISD::FADD, ISD::FDIV});

  // Try and combine setcc with csel
  setTargetDAGCombine(ISD::SETCC);

  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);

  setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND,
                       ISD::VECTOR_SPLICE, ISD::SIGN_EXTEND_INREG,
                       ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR,
                       ISD::INSERT_SUBVECTOR, ISD::STORE, ISD::BUILD_VECTOR});
  setTargetDAGCombine(ISD::TRUNCATE);
  setTargetDAGCombine(ISD::LOAD);

  setTargetDAGCombine(ISD::MSTORE);

  setTargetDAGCombine(ISD::MUL);

  setTargetDAGCombine({ISD::SELECT, ISD::VSELECT});

  setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN,
                       ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
                       ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});

  setTargetDAGCombine({ISD::MGATHER, ISD::MSCATTER});

  setTargetDAGCombine(ISD::FP_EXTEND);

  setTargetDAGCombine(ISD::GlobalAddress);

  setTargetDAGCombine(ISD::CTLZ);

  setTargetDAGCombine(ISD::VECREDUCE_AND);
  setTargetDAGCombine(ISD::VECREDUCE_OR);
  setTargetDAGCombine(ISD::VECREDUCE_XOR);

  setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);

  // In case of strict alignment, avoid an excessive number of byte wide stores.
  MaxStoresPerMemsetOptSize = 8;
  MaxStoresPerMemset =
      Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;

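  // Limits on the number of stores/loads emitted when expanding memcpy,
  // memmove and memcmp inline; the OptSize variants apply when optimising for
  // size, and under strict alignment the normal limits fall back to the
  // OptSize values.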
  MaxGluedStoresPerMemcpy = 4;
  MaxStoresPerMemcpyOptSize = 4;
  MaxStoresPerMemcpy =
      Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;

  MaxStoresPerMemmoveOptSize = 4;
  MaxStoresPerMemmove = 4;

  MaxLoadsPerMemcmpOptSize = 4;
  MaxLoadsPerMemcmp =
      Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;

  setStackPointerRegisterToSaveRestore(AArch64::SP);

  setSchedulingPreference(Sched::Hybrid);

  EnableExtLdPromotion = true;

  // Set required alignment.
  setMinFunctionAlignment(Align(4));
  // Set preferred alignments.

  // Don't align loops on Windows. The SEH unwind info generation needs to
  // know the exact length of functions before the alignments have been
  // expanded.
  if (!Subtarget->isTargetWindows())
    setPrefLoopAlignment(STI.getPrefLoopAlignment());
  setMaxBytesForAlignment(STI.getMaxBytesForLoopAlignment());
  setPrefFunctionAlignment(STI.getPrefFunctionAlignment());

  // Only change the limit for entries in a jump table if specified by
  // the subtarget, but not at the command line.
  unsigned MaxJT = STI.getMaximumJumpTableSize();
  if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
    setMaximumJumpTableSize(MaxJT);

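  // AArch64 has bitfield extract instructions (SBFX/UBFX).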
  setHasExtractBitsInsn(true);

  setMaxDivRemBitWidthSupported(128);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  if (Subtarget->hasNEON()) {
    // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
    // silliness like this:
    for (auto Op :
         {ISD::SELECT,         ISD::SELECT_CC,
          ISD::BR_CC,          ISD::FADD,           ISD::FSUB,
          ISD::FMUL,           ISD::FDIV,           ISD::FMA,
          ISD::FNEG,           ISD::FABS,           ISD::FCEIL,
          ISD::FSQRT,          ISD::FFLOOR,         ISD::FNEARBYINT,
          ISD::FRINT,          ISD::FROUND,         ISD::FROUNDEVEN,
          ISD::FTRUNC,         ISD::FMINNUM,        ISD::FMAXNUM,
          ISD::FMINIMUM,       ISD::FMAXIMUM,       ISD::STRICT_FADD,
          ISD::STRICT_FSUB,    ISD::STRICT_FMUL,    ISD::STRICT_FDIV,
          ISD::STRICT_FMA,     ISD::STRICT_FCEIL,   ISD::STRICT_FFLOOR,
          ISD::STRICT_FSQRT,   ISD::STRICT_FRINT,   ISD::STRICT_FNEARBYINT,
          ISD::STRICT_FROUND,  ISD::STRICT_FTRUNC,  ISD::STRICT_FROUNDEVEN,
          ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
          ISD::STRICT_FMAXIMUM})
      setOperationAction(Op, MVT::v1f64, Expand);

    for (auto Op :
         {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP,
          ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL,
          ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT,
          ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::STRICT_FP_ROUND})
      setOperationAction(Op, MVT::v1i64, Expand);

    // AArch64 doesn't have direct vector ->f32 conversion instructions for
    // elements smaller than i32, so promote the input to i32 first.
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);

    // Similarly, there is no direct i32 -> f64 vector conversion instruction,
    // nor a direct i32 -> f16 vector conversion. Mark these Custom so the
    // conversion happens in two steps: v4i32 -> v4f32 -> v4f16.
    for (auto Op : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
                    ISD::STRICT_UINT_TO_FP})
      for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
        setOperationAction(Op, VT, Custom);

    if (Subtarget->hasFullFP16()) {
      setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
      setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);

      setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
    } else {
      // When AArch64 doesn't have fullfp16 support, promote the input
      // to i32 first.
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
    }

    setOperationAction(ISD::CTLZ,       MVT::v1i64, Expand);
    setOperationAction(ISD::CTLZ,       MVT::v2i64, Expand);
    setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom);
    for (auto VT : {MVT::v1i64, MVT::v2i64}) {
      setOperationAction(ISD::UMAX, VT, Custom);
      setOperationAction(ISD::SMAX, VT, Custom);
      setOperationAction(ISD::UMIN, VT, Custom);
      setOperationAction(ISD::SMIN, VT, Custom);
    }

    // Custom handling for some quad-vector types to detect MULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::MUL, MVT::v4i16, Custom);
    setOperationAction(ISD::MUL, MVT::v2i32, Custom);
    setOperationAction(ISD::MUL, MVT::v1i64, Custom);

    // Saturates
    for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
                    MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SADDSAT, VT, Legal);
      setOperationAction(ISD::UADDSAT, VT, Legal);
      setOperationAction(ISD::SSUBSAT, VT, Legal);
      setOperationAction(ISD::USUBSAT, VT, Legal);
    }

    for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
                   MVT::v4i32}) {
      setOperationAction(ISD::AVGFLOORS, VT, Legal);
      setOperationAction(ISD::AVGFLOORU, VT, Legal);
      setOperationAction(ISD::AVGCEILS, VT, Legal);
      setOperationAction(ISD::AVGCEILU, VT, Legal);
      setOperationAction(ISD::ABDS, VT, Legal);
      setOperationAction(ISD::ABDU, VT, Legal);
    }

    // Vector reductions
    for (MVT VT : { MVT::v4f16, MVT::v2f32,
                    MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
      if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
        setOperationAction(ISD::VECREDUCE_FMAX, VT, Legal);
        setOperationAction(ISD::VECREDUCE_FMIN, VT, Legal);
        setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Legal);
        setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Legal);

        setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
      }
    }
    for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
                    MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
    }
1206    setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);
1207    setOperationAction(ISD::VECREDUCE_AND, MVT::v2i64, Custom);
1208    setOperationAction(ISD::VECREDUCE_OR, MVT::v2i64, Custom);
1209    setOperationAction(ISD::VECREDUCE_XOR, MVT::v2i64, Custom);
1210
1211    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
1212    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1213    // Likewise, narrowing and extending vector loads/stores aren't handled
1214    // directly.
1215    for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1216      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
1217
1218      if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1219        setOperationAction(ISD::MULHS, VT, Legal);
1220        setOperationAction(ISD::MULHU, VT, Legal);
1221      } else {
1222        setOperationAction(ISD::MULHS, VT, Expand);
1223        setOperationAction(ISD::MULHU, VT, Expand);
1224      }
1225      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
1226      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
1227
1228      setOperationAction(ISD::BSWAP, VT, Expand);
1229      setOperationAction(ISD::CTTZ, VT, Expand);
1230
1231      for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1232        setTruncStoreAction(VT, InnerVT, Expand);
1233        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1234        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1235        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1236      }
1237    }
1238
1239    // AArch64 has implementations of a lot of rounding-like FP operations.
1240    for (auto Op :
1241         {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
1242          ISD::FROUND, ISD::FROUNDEVEN, ISD::STRICT_FFLOOR,
1243          ISD::STRICT_FNEARBYINT, ISD::STRICT_FCEIL, ISD::STRICT_FRINT,
1244          ISD::STRICT_FTRUNC, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN}) {
1245      for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1246        setOperationAction(Op, Ty, Legal);
1247      if (Subtarget->hasFullFP16())
1248        for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1249          setOperationAction(Op, Ty, Legal);
1250    }
1251
1252    setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1253
1254    setOperationAction(ISD::BITCAST, MVT::i2, Custom);
1255    setOperationAction(ISD::BITCAST, MVT::i4, Custom);
1256    setOperationAction(ISD::BITCAST, MVT::i8, Custom);
1257    setOperationAction(ISD::BITCAST, MVT::i16, Custom);
1258
1259    setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
1260    setOperationAction(ISD::BITCAST, MVT::v2i16, Custom);
1261    setOperationAction(ISD::BITCAST, MVT::v4i8, Custom);
1262
1263    setLoadExtAction(ISD::EXTLOAD,  MVT::v4i16, MVT::v4i8, Custom);
1264    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1265    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1266    setLoadExtAction(ISD::EXTLOAD,  MVT::v4i32, MVT::v4i8, Custom);
1267    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1268    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1269
1270    // ADDP custom lowering
1271    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1272      setOperationAction(ISD::ADD, VT, Custom);
1273    // FADDP custom lowering
1274    for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1275      setOperationAction(ISD::FADD, VT, Custom);
1276  }
1277
1278  if (Subtarget->hasSME()) {
1279    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1280  }
1281
1282  // FIXME: Move lowering for more nodes here if they are common between
1283  // SVE and SME.
1284  if (Subtarget->hasSVEorSME()) {
1285    for (auto VT :
1286         {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1287      setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
1288      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1289      setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
1290      setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
1291    }
1292  }
1293
1294  if (Subtarget->hasSVEorSME()) {
1295    for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1296      setOperationAction(ISD::BITREVERSE, VT, Custom);
1297      setOperationAction(ISD::BSWAP, VT, Custom);
1298      setOperationAction(ISD::CTLZ, VT, Custom);
1299      setOperationAction(ISD::CTPOP, VT, Custom);
1300      setOperationAction(ISD::CTTZ, VT, Custom);
1301      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1302      setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1303      setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1304      setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1305      setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1306      setOperationAction(ISD::MGATHER, VT, Custom);
1307      setOperationAction(ISD::MSCATTER, VT, Custom);
1308      setOperationAction(ISD::MLOAD, VT, Custom);
1309      setOperationAction(ISD::MUL, VT, Custom);
1310      setOperationAction(ISD::MULHS, VT, Custom);
1311      setOperationAction(ISD::MULHU, VT, Custom);
1312      setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1313      setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1314      setOperationAction(ISD::SELECT, VT, Custom);
1315      setOperationAction(ISD::SETCC, VT, Custom);
1316      setOperationAction(ISD::SDIV, VT, Custom);
1317      setOperationAction(ISD::UDIV, VT, Custom);
1318      setOperationAction(ISD::SMIN, VT, Custom);
1319      setOperationAction(ISD::UMIN, VT, Custom);
1320      setOperationAction(ISD::SMAX, VT, Custom);
1321      setOperationAction(ISD::UMAX, VT, Custom);
1322      setOperationAction(ISD::SHL, VT, Custom);
1323      setOperationAction(ISD::SRL, VT, Custom);
1324      setOperationAction(ISD::SRA, VT, Custom);
1325      setOperationAction(ISD::ABS, VT, Custom);
1326      setOperationAction(ISD::ABDS, VT, Custom);
1327      setOperationAction(ISD::ABDU, VT, Custom);
1328      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1329      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1330      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1331      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1332      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1333      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1334      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1335      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1336      setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
1337      setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
1338
1339      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
1340      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
1341      setOperationAction(ISD::SELECT_CC, VT, Expand);
1342      setOperationAction(ISD::ROTL, VT, Expand);
1343      setOperationAction(ISD::ROTR, VT, Expand);
1344
1345      setOperationAction(ISD::SADDSAT, VT, Legal);
1346      setOperationAction(ISD::UADDSAT, VT, Legal);
1347      setOperationAction(ISD::SSUBSAT, VT, Legal);
1348      setOperationAction(ISD::USUBSAT, VT, Legal);
1349      setOperationAction(ISD::UREM, VT, Expand);
1350      setOperationAction(ISD::SREM, VT, Expand);
1351      setOperationAction(ISD::SDIVREM, VT, Expand);
1352      setOperationAction(ISD::UDIVREM, VT, Expand);
1353
1354      setOperationAction(ISD::AVGFLOORS, VT, Custom);
1355      setOperationAction(ISD::AVGFLOORU, VT, Custom);
1356      setOperationAction(ISD::AVGCEILS, VT, Custom);
1357      setOperationAction(ISD::AVGCEILU, VT, Custom);
1358
1359      if (!Subtarget->isLittleEndian())
1360        setOperationAction(ISD::BITCAST, VT, Expand);
1361
1362      if (Subtarget->hasSVE2orSME())
1363        // For SLI/SRI.
1364        setOperationAction(ISD::OR, VT, Custom);
1365    }
1366
1367    // Illegal unpacked integer vector types.
1368    for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1369      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1370      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1371    }
1372
1373    // Legalize unpacked bitcasts to REINTERPRET_CAST.
1374    for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1375                    MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1376      setOperationAction(ISD::BITCAST, VT, Custom);
1377
1378    for (auto VT :
1379         { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1380           MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1381      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);
1382
1383    for (auto VT :
1384         {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1385      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1386      setOperationAction(ISD::SELECT, VT, Custom);
1387      setOperationAction(ISD::SETCC, VT, Custom);
1388      setOperationAction(ISD::TRUNCATE, VT, Custom);
1389      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1390      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1391      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1392
1393      setOperationAction(ISD::SELECT_CC, VT, Expand);
1394      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1395      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1396
1397      // There are no legal MVT::nxv16f## based types.
1398      if (VT != MVT::nxv16i1) {
1399        setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1400        setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1401      }
1402    }
1403
1404    // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1405    for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1406                    MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1407                    MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1408      setOperationAction(ISD::MLOAD, VT, Custom);
1409      setOperationAction(ISD::MSTORE, VT, Custom);
1410      setOperationAction(ISD::MGATHER, VT, Custom);
1411      setOperationAction(ISD::MSCATTER, VT, Custom);
1412    }
1413
1414    // First, expand all scalable vector extending loads and truncating stores,
1415    // covering both integer and floating-point scalable vectors.
1416    for (MVT VT : MVT::scalable_vector_valuetypes()) {
1417      for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1418        setTruncStoreAction(VT, InnerVT, Expand);
1419        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1420        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1421        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1422      }
1423    }
1424
1425    // Then, selectively enable those which we directly support.
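    // These are the element-size combinations handled directly by SVE's
    // truncating ST1B/ST1H/ST1W stores and extending LD1B/LD1H/LD1W loads
    // (and their sign-extending counterparts).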
1426    setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1427    setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1428    setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1429    setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1430    setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1431    setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1432    for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1433      setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1434      setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1435      setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1436      setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1437      setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1438      setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1439    }
1440
1441    // SVE supports truncating stores of 64-bit and 128-bit vectors.
1442    setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1443    setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1444    setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1445    setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1446    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1447
1448    for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1449                    MVT::nxv4f32, MVT::nxv2f64}) {
1450      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1451      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1452      setOperationAction(ISD::MGATHER, VT, Custom);
1453      setOperationAction(ISD::MSCATTER, VT, Custom);
1454      setOperationAction(ISD::MLOAD, VT, Custom);
1455      setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1456      setOperationAction(ISD::SELECT, VT, Custom);
1457      setOperationAction(ISD::SETCC, VT, Custom);
1458      setOperationAction(ISD::FADD, VT, Custom);
1459      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1460      setOperationAction(ISD::FDIV, VT, Custom);
1461      setOperationAction(ISD::FMA, VT, Custom);
1462      setOperationAction(ISD::FMAXIMUM, VT, Custom);
1463      setOperationAction(ISD::FMAXNUM, VT, Custom);
1464      setOperationAction(ISD::FMINIMUM, VT, Custom);
1465      setOperationAction(ISD::FMINNUM, VT, Custom);
1466      setOperationAction(ISD::FMUL, VT, Custom);
1467      setOperationAction(ISD::FNEG, VT, Custom);
1468      setOperationAction(ISD::FSUB, VT, Custom);
1469      setOperationAction(ISD::FCEIL, VT, Custom);
1470      setOperationAction(ISD::FFLOOR, VT, Custom);
1471      setOperationAction(ISD::FNEARBYINT, VT, Custom);
1472      setOperationAction(ISD::FRINT, VT, Custom);
1473      setOperationAction(ISD::FROUND, VT, Custom);
1474      setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1475      setOperationAction(ISD::FTRUNC, VT, Custom);
1476      setOperationAction(ISD::FSQRT, VT, Custom);
1477      setOperationAction(ISD::FABS, VT, Custom);
1478      setOperationAction(ISD::FP_EXTEND, VT, Custom);
1479      setOperationAction(ISD::FP_ROUND, VT, Custom);
1480      setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1481      setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1482      setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1483      setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
1484      setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
1485      if (Subtarget->isSVEAvailable())
1486        setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1487      setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1488      setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
1489      setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
1490
1491      setOperationAction(ISD::SELECT_CC, VT, Expand);
1492      setOperationAction(ISD::FREM, VT, Expand);
1493      setOperationAction(ISD::FPOW, VT, Expand);
1494      setOperationAction(ISD::FPOWI, VT, Expand);
1495      setOperationAction(ISD::FCOS, VT, Expand);
1496      setOperationAction(ISD::FSIN, VT, Expand);
1497      setOperationAction(ISD::FSINCOS, VT, Expand);
1498      setOperationAction(ISD::FEXP, VT, Expand);
1499      setOperationAction(ISD::FEXP2, VT, Expand);
1500      setOperationAction(ISD::FEXP10, VT, Expand);
1501      setOperationAction(ISD::FLOG, VT, Expand);
1502      setOperationAction(ISD::FLOG2, VT, Expand);
1503      setOperationAction(ISD::FLOG10, VT, Expand);
1504
1505      setCondCodeAction(ISD::SETO, VT, Expand);
1506      setCondCodeAction(ISD::SETOLT, VT, Expand);
1507      setCondCodeAction(ISD::SETLT, VT, Expand);
1508      setCondCodeAction(ISD::SETOLE, VT, Expand);
1509      setCondCodeAction(ISD::SETLE, VT, Expand);
1510      setCondCodeAction(ISD::SETULT, VT, Expand);
1511      setCondCodeAction(ISD::SETULE, VT, Expand);
1512      setCondCodeAction(ISD::SETUGE, VT, Expand);
1513      setCondCodeAction(ISD::SETUGT, VT, Expand);
1514      setCondCodeAction(ISD::SETUEQ, VT, Expand);
1515      setCondCodeAction(ISD::SETONE, VT, Expand);
1516
1517      if (!Subtarget->isLittleEndian())
1518        setOperationAction(ISD::BITCAST, VT, Expand);
1519    }
1520
1521    for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1522      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1523      setOperationAction(ISD::MGATHER, VT, Custom);
1524      setOperationAction(ISD::MSCATTER, VT, Custom);
1525      setOperationAction(ISD::MLOAD, VT, Custom);
1526      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1527      setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1528
1529      if (!Subtarget->isLittleEndian())
1530        setOperationAction(ISD::BITCAST, VT, Expand);
1531    }
1532
1533    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
1534    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
1535
1536    // NEON doesn't support integer divides, but SVE does
1537    for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1538                    MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1539      setOperationAction(ISD::SDIV, VT, Custom);
1540      setOperationAction(ISD::UDIV, VT, Custom);
1541    }
1542
1543    // NEON doesn't support 64-bit vector integer muls, but SVE does.
1544    setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1545    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1546
1547    if (Subtarget->isSVEAvailable()) {
1548      // NEON doesn't support across-vector reductions, but SVE does.
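      // In particular, strictly-ordered FADD reductions (VECREDUCE_SEQ_FADD)
      // can be lowered to SVE's FADDA.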
1549      for (auto VT :
1550           {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1551        setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1552    }
1553
1554    if (!Subtarget->isNeonAvailable()) {
1555      setTruncStoreAction(MVT::v2f32, MVT::v2f16, Custom);
1556      setTruncStoreAction(MVT::v4f32, MVT::v4f16, Custom);
1557      setTruncStoreAction(MVT::v8f32, MVT::v8f16, Custom);
1558      setTruncStoreAction(MVT::v1f64, MVT::v1f16, Custom);
1559      setTruncStoreAction(MVT::v2f64, MVT::v2f16, Custom);
1560      setTruncStoreAction(MVT::v4f64, MVT::v4f16, Custom);
1561      setTruncStoreAction(MVT::v1f64, MVT::v1f32, Custom);
1562      setTruncStoreAction(MVT::v2f64, MVT::v2f32, Custom);
1563      setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom);
1564      for (MVT VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1565                     MVT::v4i32, MVT::v1i64, MVT::v2i64})
1566        addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ true);
1567
1568      for (MVT VT :
1569           {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1570        addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ true);
1571    }
1572
1573    // NOTE: Currently this has to happen after computeRegisterProperties rather
1574    // than the preferred option of combining it with the addRegisterClass call.
1575    if (Subtarget->useSVEForFixedLengthVectors()) {
1576      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
1577        if (useSVEForFixedLengthVectorVT(VT))
1578          addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ false);
1579      for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
1580        if (useSVEForFixedLengthVectorVT(VT))
1581          addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ false);
1582
1583      // 64-bit results can come from an input wider than a NEON register.
1584      for (auto VT : {MVT::v8i8, MVT::v4i16})
1585        setOperationAction(ISD::TRUNCATE, VT, Custom);
1586      setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
1587
1588      // 128-bit results imply an input wider than a NEON register.
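      // For example, truncating a fixed-length v8i32 (handled via SVE) down to
      // v8i16 still produces a 128-bit, NEON-sized result.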
1589      for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1590        setOperationAction(ISD::TRUNCATE, VT, Custom);
1591      for (auto VT : {MVT::v8f16, MVT::v4f32})
1592        setOperationAction(ISD::FP_ROUND, VT, Custom);
1593
1594      // These operations are not supported on NEON but SVE can do them.
1595      setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
1596      setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1597      setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1598      setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1599      setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1600      setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1601      setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1602      setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1603      setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1604      setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1605      setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1606      setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1607      setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1608      setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1609      setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1610      setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1611      setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom);
1612      setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom);
1613      setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom);
1614      setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
1615
1616      // Int operations with no NEON support.
1617      for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1618                      MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1619        setOperationAction(ISD::BITREVERSE, VT, Custom);
1620        setOperationAction(ISD::CTTZ, VT, Custom);
1621        setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1622        setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1623        setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1624        setOperationAction(ISD::MULHS, VT, Custom);
1625        setOperationAction(ISD::MULHU, VT, Custom);
1626      }
1627
1628
1629      // Use SVE for vectors with more than 2 elements.
1630      for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1631        setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1632    }
1633
1634    setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1635    setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1636    setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1637    setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1638
1639    setOperationAction(ISD::VSCALE, MVT::i32, Custom);
1640  }
1641
1642  if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1643    // Only required for llvm.aarch64.mops.memset.tag
1644    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
1645  }
1646
1647  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1648
1649  if (Subtarget->hasSVE()) {
1650    setOperationAction(ISD::FLDEXP, MVT::f64, Custom);
1651    setOperationAction(ISD::FLDEXP, MVT::f32, Custom);
1652    setOperationAction(ISD::FLDEXP, MVT::f16, Custom);
1653  }
1654
1655  PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1656
1657  IsStrictFPEnabled = true;
1658  setMaxAtomicSizeInBitsSupported(128);
1659
1660  if (Subtarget->isWindowsArm64EC()) {
1661    // FIXME: are there intrinsics we need to exclude from this?
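    // Arm64EC mangles the names of natively compiled functions with a leading
    // '#', so redirect each libcall to its mangled form (e.g. "#memcpy").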
1662    for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
1663      auto code = static_cast<RTLIB::Libcall>(i);
1664      auto libcallName = getLibcallName(code);
1665      if ((libcallName != nullptr) && (libcallName[0] != '#')) {
1666        setLibcallName(code, Saver.save(Twine("#") + libcallName).data());
1667      }
1668    }
1669  }
1670}
1671
1672void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1673  assert(VT.isVector() && "VT should be a vector type");
1674
1675  if (VT.isFloatingPoint()) {
1676    MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
1677    setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1678    setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1679  }
1680
1681  // Mark vector float intrinsics as expand.
1682  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1683    setOperationAction(ISD::FSIN, VT, Expand);
1684    setOperationAction(ISD::FCOS, VT, Expand);
1685    setOperationAction(ISD::FPOW, VT, Expand);
1686    setOperationAction(ISD::FLOG, VT, Expand);
1687    setOperationAction(ISD::FLOG2, VT, Expand);
1688    setOperationAction(ISD::FLOG10, VT, Expand);
1689    setOperationAction(ISD::FEXP, VT, Expand);
1690    setOperationAction(ISD::FEXP2, VT, Expand);
1691    setOperationAction(ISD::FEXP10, VT, Expand);
1692  }
1693
1694  // But we do support custom-lowering for FCOPYSIGN.
1695  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1696      ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16()))
1697    setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1698
1699  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1700  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1701  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1702  setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1703  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1704  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1705  setOperationAction(ISD::SRA, VT, Custom);
1706  setOperationAction(ISD::SRL, VT, Custom);
1707  setOperationAction(ISD::SHL, VT, Custom);
1708  setOperationAction(ISD::OR, VT, Custom);
1709  setOperationAction(ISD::SETCC, VT, Custom);
1710  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
1711
1712  setOperationAction(ISD::SELECT, VT, Expand);
1713  setOperationAction(ISD::SELECT_CC, VT, Expand);
1714  setOperationAction(ISD::VSELECT, VT, Expand);
1715  for (MVT InnerVT : MVT::all_valuetypes())
1716    setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1717
1718  // CNT supports only B element sizes; wider elements use UADDLP to widen.
1719  if (VT != MVT::v8i8 && VT != MVT::v16i8)
1720    setOperationAction(ISD::CTPOP, VT, Custom);
1721
1722  setOperationAction(ISD::UDIV, VT, Expand);
1723  setOperationAction(ISD::SDIV, VT, Expand);
1724  setOperationAction(ISD::UREM, VT, Expand);
1725  setOperationAction(ISD::SREM, VT, Expand);
1726  setOperationAction(ISD::FREM, VT, Expand);
1727
1728  for (unsigned Opcode :
1729       {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
1730        ISD::FP_TO_UINT_SAT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
1731    setOperationAction(Opcode, VT, Custom);
1732
1733  if (!VT.isFloatingPoint())
1734    setOperationAction(ISD::ABS, VT, Legal);
1735
1736  // [SU][MIN|MAX] are available for all NEON types apart from i64.
1737  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1738    for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1739      setOperationAction(Opcode, VT, Legal);
1740
1741  // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1742  // NEON types.
1743  if (VT.isFloatingPoint() &&
1744      VT.getVectorElementType() != MVT::bf16 &&
1745      (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1746    for (unsigned Opcode :
1747         {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
1748          ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_FMINNUM,
1749          ISD::STRICT_FMAXNUM, ISD::STRICT_FADD, ISD::STRICT_FSUB,
1750          ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA,
1751          ISD::STRICT_FSQRT})
1752      setOperationAction(Opcode, VT, Legal);
1753
1754  // Strict fp extend and trunc are legal
1755  if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1756    setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
1757  if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1758    setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal);
1759
1760  // FIXME: We could potentially make use of the vector comparison instructions
1761  // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1762  // complications:
1763  //  * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1764  //    so we would need to expand when the condition code doesn't match the
1765  //    kind of comparison.
1766  //  * Some kinds of comparison require more than one FCMXY instruction so
1767  //    would need to be expanded instead.
1768  //  * The lowering of the non-strict versions involves target-specific ISD
1769  //    nodes so we would likely need to add strict versions of all of them and
1770  //    handle them appropriately.
1771  setOperationAction(ISD::STRICT_FSETCC, VT, Expand);
1772  setOperationAction(ISD::STRICT_FSETCCS, VT, Expand);
1773
1774  if (Subtarget->isLittleEndian()) {
1775    for (unsigned im = (unsigned)ISD::PRE_INC;
1776         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1777      setIndexedLoadAction(im, VT, Legal);
1778      setIndexedStoreAction(im, VT, Legal);
1779    }
1780  }
1781
1782  if (Subtarget->hasD128()) {
1783    setOperationAction(ISD::READ_REGISTER, MVT::i128, Custom);
1784    setOperationAction(ISD::WRITE_REGISTER, MVT::i128, Custom);
1785  }
1786}
1787
1788bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
1789                                                          EVT OpVT) const {
1790  // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1791  if (!Subtarget->hasSVE())
1792    return true;
1793
1794  // We can only support legal predicate result types. We can use the SVE
1795  // whilelo instruction for generating fixed-width predicates too.
1796  if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1797      ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1798      ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1799    return true;
1800
1801  // The whilelo instruction only works with i32 or i64 scalar inputs.
1802  if (OpVT != MVT::i32 && OpVT != MVT::i64)
1803    return true;
1804
1805  return false;
1806}
1807
1808bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
1809  return !Subtarget->hasSVEorSME() || VT != MVT::nxv16i1;
1810}
1811
1812void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT,
1813                                                     bool StreamingSVE) {
1814  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1815
1816  // By default everything must be expanded.
1817  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1818    setOperationAction(Op, VT, Expand);
1819
1820  if (VT.isFloatingPoint()) {
1821    setCondCodeAction(ISD::SETO, VT, Expand);
1822    setCondCodeAction(ISD::SETOLT, VT, Expand);
1823    setCondCodeAction(ISD::SETOLE, VT, Expand);
1824    setCondCodeAction(ISD::SETULT, VT, Expand);
1825    setCondCodeAction(ISD::SETULE, VT, Expand);
1826    setCondCodeAction(ISD::SETUGE, VT, Expand);
1827    setCondCodeAction(ISD::SETUGT, VT, Expand);
1828    setCondCodeAction(ISD::SETUEQ, VT, Expand);
1829    setCondCodeAction(ISD::SETONE, VT, Expand);
1830  }
1831
1832  // Mark integer truncating stores/extending loads as having custom lowering
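  // (e.g. for VT == v4i32 the loop below covers the v4i8 and v4i16 memory VTs).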
1833  if (VT.isInteger()) {
1834    MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1835    while (InnerVT != VT) {
1836      setTruncStoreAction(VT, InnerVT, Custom);
1837      setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
1838      setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
1839      InnerVT = InnerVT.changeVectorElementType(
1840          MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1841    }
1842  }
1843
1844  // Mark floating-point truncating stores/extending loads as having custom
1845  // lowering
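  // (e.g. for VT == v4f64 the loop below covers the v4f16 and v4f32 memory VTs).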
1846  if (VT.isFloatingPoint()) {
1847    MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1848    while (InnerVT != VT) {
1849      setTruncStoreAction(VT, InnerVT, Custom);
1850      setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
1851      InnerVT = InnerVT.changeVectorElementType(
1852          MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits()));
1853    }
1854  }
1855
1856  // Lower fixed length vector operations to scalable equivalents.
1857  setOperationAction(ISD::ABS, VT, Custom);
1858  setOperationAction(ISD::ADD, VT, Custom);
1859  setOperationAction(ISD::AND, VT, Custom);
1860  setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1861  setOperationAction(ISD::BITCAST, VT, StreamingSVE ? Legal : Custom);
1862  setOperationAction(ISD::BITREVERSE, VT, Custom);
1863  setOperationAction(ISD::BSWAP, VT, Custom);
1864  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1865  setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1866  setOperationAction(ISD::CTLZ, VT, Custom);
1867  setOperationAction(ISD::CTPOP, VT, Custom);
1868  setOperationAction(ISD::CTTZ, VT, Custom);
1869  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1870  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1871  setOperationAction(ISD::FABS, VT, Custom);
1872  setOperationAction(ISD::FADD, VT, Custom);
1873  setOperationAction(ISD::FCEIL, VT, Custom);
1874  setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1875  setOperationAction(ISD::FDIV, VT, Custom);
1876  setOperationAction(ISD::FFLOOR, VT, Custom);
1877  setOperationAction(ISD::FMA, VT, Custom);
1878  setOperationAction(ISD::FMAXIMUM, VT, Custom);
1879  setOperationAction(ISD::FMAXNUM, VT, Custom);
1880  setOperationAction(ISD::FMINIMUM, VT, Custom);
1881  setOperationAction(ISD::FMINNUM, VT, Custom);
1882  setOperationAction(ISD::FMUL, VT, Custom);
1883  setOperationAction(ISD::FNEARBYINT, VT, Custom);
1884  setOperationAction(ISD::FNEG, VT, Custom);
1885  setOperationAction(ISD::FP_EXTEND, VT, Custom);
1886  setOperationAction(ISD::FP_ROUND, VT, Custom);
1887  setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1888  setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1889  setOperationAction(ISD::FRINT, VT, Custom);
1890  setOperationAction(ISD::FROUND, VT, Custom);
1891  setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1892  setOperationAction(ISD::FSQRT, VT, Custom);
1893  setOperationAction(ISD::FSUB, VT, Custom);
1894  setOperationAction(ISD::FTRUNC, VT, Custom);
1895  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1896  setOperationAction(ISD::LOAD, VT, StreamingSVE ? Legal : Custom);
1897  setOperationAction(ISD::MGATHER, VT, StreamingSVE ? Expand : Custom);
1898  setOperationAction(ISD::MLOAD, VT, Custom);
1899  setOperationAction(ISD::MSCATTER, VT, StreamingSVE ? Expand : Custom);
1900  setOperationAction(ISD::MSTORE, VT, Custom);
1901  setOperationAction(ISD::MUL, VT, Custom);
1902  setOperationAction(ISD::MULHS, VT, Custom);
1903  setOperationAction(ISD::MULHU, VT, Custom);
1904  setOperationAction(ISD::OR, VT, Custom);
1905  setOperationAction(ISD::SCALAR_TO_VECTOR, VT, StreamingSVE ? Legal : Expand);
1906  setOperationAction(ISD::SDIV, VT, Custom);
1907  setOperationAction(ISD::SELECT, VT, Custom);
1908  setOperationAction(ISD::SETCC, VT, Custom);
1909  setOperationAction(ISD::SHL, VT, Custom);
1910  setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1911  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
1912  setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1913  setOperationAction(ISD::SMAX, VT, Custom);
1914  setOperationAction(ISD::SMIN, VT, Custom);
1915  setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
1916  setOperationAction(ISD::SRA, VT, Custom);
1917  setOperationAction(ISD::SRL, VT, Custom);
1918  setOperationAction(ISD::STORE, VT, StreamingSVE ? Legal : Custom);
1919  setOperationAction(ISD::SUB, VT, Custom);
1920  setOperationAction(ISD::TRUNCATE, VT, Custom);
1921  setOperationAction(ISD::UDIV, VT, Custom);
1922  setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1923  setOperationAction(ISD::UMAX, VT, Custom);
1924  setOperationAction(ISD::UMIN, VT, Custom);
1925  setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1926  setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1927  setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1928  setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1929  setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1930  setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
1931  setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
1932  setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1933  setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT,
1934                     StreamingSVE ? Expand : Custom);
1935  setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1936  setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1937  setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1938  setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1939  setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1940  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1941  setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1942  setOperationAction(ISD::VSELECT, VT, Custom);
1943  setOperationAction(ISD::XOR, VT, Custom);
1944  setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1945}
1946
1947void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1948  addRegisterClass(VT, &AArch64::FPR64RegClass);
1949  addTypeForNEON(VT);
1950}
1951
1952void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
1953  addRegisterClass(VT, &AArch64::FPR128RegClass);
1954  addTypeForNEON(VT);
1955}
1956
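// The setcc result for a scalable vector is a predicate with the same element
// count (e.g. nxv4i32 -> nxv4i1); for a fixed vector it is an integer vector of
// the same shape (e.g. v4f32 -> v4i32); scalar compares produce i32.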
1957EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
1958                                              LLVMContext &C, EVT VT) const {
1959  if (!VT.isVector())
1960    return MVT::i32;
1961  if (VT.isScalableVector())
1962    return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
1963  return VT.changeVectorElementTypeToInteger();
1964}
1965
1966// isIntImmediate - This method tests to see if the node is a constant
1967// operand. If so, Imm will receive the value.
1968static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
1969  if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
1970    Imm = C->getZExtValue();
1971    return true;
1972  }
1973  return false;
1974}
1975
1976// isOpcWithIntImmediate - This method tests to see if the node is a specific
1977// opcode and that it has an immediate integer right operand.
1978// If so, Imm will receive the value.
1979static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
1980                                  uint64_t &Imm) {
1981  return N->getOpcode() == Opc &&
1982         isIntImmediate(N->getOperand(1).getNode(), Imm);
1983}
1984
1985static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
1986                               const APInt &Demanded,
1987                               TargetLowering::TargetLoweringOpt &TLO,
1988                               unsigned NewOpc) {
1989  uint64_t OldImm = Imm, NewImm, Enc;
1990  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
1991
1992  // Return if the immediate is already all zeros, all ones, a bimm32 or a
1993  // bimm64.
1994  if (Imm == 0 || Imm == Mask ||
1995      AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
1996    return false;
1997
1998  unsigned EltSize = Size;
1999  uint64_t DemandedBits = Demanded.getZExtValue();
2000
2001  // Clear bits that are not demanded.
2002  Imm &= DemandedBits;
2003
2004  while (true) {
2005    // The goal here is to set the non-demanded bits in a way that minimizes
2006    // the number of transitions between 0 and 1. To achieve this, we set each
2007    // non-demanded bit to the value of the preceding demanded bit.
2008    // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2009    // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2010    // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2011    // The final result is 0b11000011.
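    // Roughly how the code below achieves this: RotatedImm marks the lowest bit
    // of each run of non-demanded bits whose preceding demanded bit is 0; adding
    // NonDemandedBits then clears that whole run via carry propagation, so Ones
    // keeps 1s only in runs that follow a demanded 1, and Carry re-applies the
    // wrap-around carry from the top bit back to bit 0.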
2012    uint64_t NonDemandedBits = ~DemandedBits;
2013    uint64_t InvertedImm = ~Imm & DemandedBits;
2014    uint64_t RotatedImm =
2015        ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2016        NonDemandedBits;
2017    uint64_t Sum = RotatedImm + NonDemandedBits;
2018    bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2019    uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2020    NewImm = (Imm | Ones) & Mask;
2021
2022    // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2023    // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2024    // we halve the element size and continue the search.
2025    if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2026      break;
2027
2028    // We cannot shrink the element size any further if it is 2-bits.
2029    if (EltSize == 2)
2030      return false;
2031
2032    EltSize /= 2;
2033    Mask >>= EltSize;
2034    uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2035
2036    // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2037    if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2038      return false;
2039
2040    // Merge the upper and lower halves of Imm and DemandedBits.
2041    Imm |= Hi;
2042    DemandedBits |= DemandedBitsHi;
2043  }
2044
2045  ++NumOptimizedImms;
2046
2047  // Replicate the element across the register width.
2048  while (EltSize < Size) {
2049    NewImm |= NewImm << EltSize;
2050    EltSize *= 2;
2051  }
2052
2053  (void)OldImm;
2054  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2055         "demanded bits should never be altered");
2056  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2057
2058  // Create the new constant immediate node.
2059  EVT VT = Op.getValueType();
2060  SDLoc DL(Op);
2061  SDValue New;
2062
2063  // If the new constant immediate is all-zeros or all-ones, let the target
2064  // independent DAG combine optimize this node.
2065  if (NewImm == 0 || NewImm == OrigMask) {
2066    New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2067                          TLO.DAG.getConstant(NewImm, DL, VT));
2068  // Otherwise, create a machine node so that target independent DAG combine
2069  // doesn't undo this optimization.
2070  } else {
2071    Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
2072    SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2073    New = SDValue(
2074        TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2075  }
2076
2077  return TLO.CombineTo(Op, New);
2078}
2079
2080bool AArch64TargetLowering::targetShrinkDemandedConstant(
2081    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2082    TargetLoweringOpt &TLO) const {
2083  // Delay this optimization to as late as possible.
2084  if (!TLO.LegalOps)
2085    return false;
2086
2087  if (!EnableOptimizeLogicalImm)
2088    return false;
2089
2090  EVT VT = Op.getValueType();
2091  if (VT.isVector())
2092    return false;
2093
2094  unsigned Size = VT.getSizeInBits();
2095  assert((Size == 32 || Size == 64) &&
2096         "i32 or i64 is expected after legalization.");
2097
2098  // Exit early if we demand all bits.
2099  if (DemandedBits.popcount() == Size)
2100    return false;
2101
2102  unsigned NewOpc;
2103  switch (Op.getOpcode()) {
2104  default:
2105    return false;
2106  case ISD::AND:
2107    NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2108    break;
2109  case ISD::OR:
2110    NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2111    break;
2112  case ISD::XOR:
2113    NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2114    break;
2115  }
2116  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2117  if (!C)
2118    return false;
2119  uint64_t Imm = C->getZExtValue();
2120  return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2121}
2122
2123/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2124/// Mask are known to be either zero or one and return them in Known.
2125void AArch64TargetLowering::computeKnownBitsForTargetNode(
2126    const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2127    const SelectionDAG &DAG, unsigned Depth) const {
2128  switch (Op.getOpcode()) {
2129  default:
2130    break;
2131  case AArch64ISD::DUP: {
2132    SDValue SrcOp = Op.getOperand(0);
2133    Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2134    if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2135      assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2136             "Expected DUP implicit truncation");
2137      Known = Known.trunc(Op.getScalarValueSizeInBits());
2138    }
2139    break;
2140  }
2141  case AArch64ISD::CSEL: {
2142    KnownBits Known2;
2143    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2144    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2145    Known = Known.intersectWith(Known2);
2146    break;
2147  }
2148  case AArch64ISD::BICi: {
2149    // Compute the bit cleared value.
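    // Operand 1 is the immediate and operand 2 the left-shift amount, so every
    // bit set in (imm << shift) is known to be cleared in the result.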
2150    uint64_t Mask =
2151        ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
2152    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2153    Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
2154    break;
2155  }
2156  case AArch64ISD::VLSHR: {
2157    KnownBits Known2;
2158    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2159    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2160    Known = KnownBits::lshr(Known, Known2);
2161    break;
2162  }
2163  case AArch64ISD::VASHR: {
2164    KnownBits Known2;
2165    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2166    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2167    Known = KnownBits::ashr(Known, Known2);
2168    break;
2169  }
2170  case AArch64ISD::VSHL: {
2171    KnownBits Known2;
2172    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2173    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2174    Known = KnownBits::shl(Known, Known2);
2175    break;
2176  }
2177  case AArch64ISD::MOVI: {
2178    Known = KnownBits::makeConstant(
2179        APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2180    break;
2181  }
2182  case AArch64ISD::LOADgot:
2183  case AArch64ISD::ADDlow: {
2184    if (!Subtarget->isTargetILP32())
2185      break;
2186    // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2187    Known.Zero = APInt::getHighBitsSet(64, 32);
2188    break;
2189  }
2190  case AArch64ISD::ASSERT_ZEXT_BOOL: {
2191    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
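    // The wrapped value is a boolean (0 or 1), so bits 1-7 are additionally
    // known to be zero.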
2192    Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2193    break;
2194  }
2195  case ISD::INTRINSIC_W_CHAIN: {
2196    Intrinsic::ID IntID =
2197        static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2198    switch (IntID) {
2199    default: return;
2200    case Intrinsic::aarch64_ldaxr:
2201    case Intrinsic::aarch64_ldxr: {
2202      unsigned BitWidth = Known.getBitWidth();
2203      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2204      unsigned MemBits = VT.getScalarSizeInBits();
2205      Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2206      return;
2207    }
2208    }
2209    break;
2210  }
2211  case ISD::INTRINSIC_WO_CHAIN:
2212  case ISD::INTRINSIC_VOID: {
2213    unsigned IntNo = Op.getConstantOperandVal(0);
2214    switch (IntNo) {
2215    default:
2216      break;
2217    case Intrinsic::aarch64_neon_uaddlv: {
2218      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2219      unsigned BitWidth = Known.getBitWidth();
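      // UADDLV sums every byte lane: 8 lanes of at most 255 sum to at most 2040
      // (11 bits), and 16 lanes to at most 4080 (12 bits).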
2220      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2221        unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2222        assert(BitWidth >= Bound && "Unexpected width!");
2223        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - Bound);
2224        Known.Zero |= Mask;
2225      }
2226      break;
2227    }
2228    case Intrinsic::aarch64_neon_umaxv:
2229    case Intrinsic::aarch64_neon_uminv: {
2230      // Figure out the datatype of the vector operand. The UMAXV/UMINV
2231      // instructions zero-extend the result, so we can mark as known zero all
2232      // bits larger than the element datatype. 32-bit or larger doesn't need
2233      // this as those are legal types and will be handled by isel directly.
2234      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2235      unsigned BitWidth = Known.getBitWidth();
2236      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2237        assert(BitWidth >= 8 && "Unexpected width!");
2238        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
2239        Known.Zero |= Mask;
2240      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2241        assert(BitWidth >= 16 && "Unexpected width!");
2242        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
2243        Known.Zero |= Mask;
2244      }
2245      break;
2246    }
2247    }
2248  }
2249  }
2250}
2251
2252unsigned AArch64TargetLowering::ComputeNumSignBitsForTargetNode(
2253    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2254    unsigned Depth) const {
2255  EVT VT = Op.getValueType();
2256  unsigned VTBits = VT.getScalarSizeInBits();
2257  unsigned Opcode = Op.getOpcode();
2258  switch (Opcode) {
2259    case AArch64ISD::CMEQ:
2260    case AArch64ISD::CMGE:
2261    case AArch64ISD::CMGT:
2262    case AArch64ISD::CMHI:
2263    case AArch64ISD::CMHS:
2264    case AArch64ISD::FCMEQ:
2265    case AArch64ISD::FCMGE:
2266    case AArch64ISD::FCMGT:
2267    case AArch64ISD::CMEQz:
2268    case AArch64ISD::CMGEz:
2269    case AArch64ISD::CMGTz:
2270    case AArch64ISD::CMLEz:
2271    case AArch64ISD::CMLTz:
2272    case AArch64ISD::FCMEQz:
2273    case AArch64ISD::FCMGEz:
2274    case AArch64ISD::FCMGTz:
2275    case AArch64ISD::FCMLEz:
2276    case AArch64ISD::FCMLTz:
2277      // Compares return either 0 or all-ones
2278      return VTBits;
2279  }
2280
2281  return 1;
2282}
2283
2284MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
2285                                                  EVT) const {
2286  return MVT::i64;
2287}
2288
2289bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2290    EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2291    unsigned *Fast) const {
2292  if (Subtarget->requiresStrictAlign())
2293    return false;
2294
2295  if (Fast) {
2296    // Some CPUs are fine with unaligned stores except for 128-bit ones.
2297    *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2298            // See comments in performSTORECombine() for more details about
2299            // these conditions.
2300
2301            // Code that uses clang vector extensions can mark that it
2302            // wants unaligned accesses to be treated as fast by
2303            // underspecifying alignment to be 1 or 2.
2304            Alignment <= 2 ||
2305
2306            // Disregard v2i64. Memcpy lowering produces those and splitting
2307            // them regresses performance on micro-benchmarks and olden/bh.
2308            VT == MVT::v2i64;
2309  }
2310  return true;
2311}
2312
2313// Same as above but handling LLTs instead.
2314bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2315    LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2316    unsigned *Fast) const {
2317  if (Subtarget->requiresStrictAlign())
2318    return false;
2319
2320  if (Fast) {
2321    // Some CPUs are fine with unaligned stores except for 128-bit ones.
2322    *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2323            Ty.getSizeInBytes() != 16 ||
2324            // See comments in performSTORECombine() for more details about
2325            // these conditions.
2326
2327            // Code that uses clang vector extensions can mark that it
2328            // wants unaligned accesses to be treated as fast by
2329            // underspecifying alignment to be 1 or 2.
2330            Alignment <= 2 ||
2331
2332            // Disregard v2i64. Memcpy lowering produces those and splitting
2333            // them regresses performance on micro-benchmarks and olden/bh.
2334            Ty == LLT::fixed_vector(2, 64);
2335  }
2336  return true;
2337}
2338
2339FastISel *
2340AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
2341                                      const TargetLibraryInfo *libInfo) const {
2342  return AArch64::createFastISel(funcInfo, libInfo);
2343}
2344
2345const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2346#define MAKE_CASE(V)                                                           \
2347  case V:                                                                      \
2348    return #V;
2349  switch ((AArch64ISD::NodeType)Opcode) {
2350  case AArch64ISD::FIRST_NUMBER:
2351    break;
2352    MAKE_CASE(AArch64ISD::COALESCER_BARRIER)
2353    MAKE_CASE(AArch64ISD::SMSTART)
2354    MAKE_CASE(AArch64ISD::SMSTOP)
2355    MAKE_CASE(AArch64ISD::RESTORE_ZA)
2356    MAKE_CASE(AArch64ISD::RESTORE_ZT)
2357    MAKE_CASE(AArch64ISD::SAVE_ZT)
2358    MAKE_CASE(AArch64ISD::CALL)
2359    MAKE_CASE(AArch64ISD::ADRP)
2360    MAKE_CASE(AArch64ISD::ADR)
2361    MAKE_CASE(AArch64ISD::ADDlow)
2362    MAKE_CASE(AArch64ISD::LOADgot)
2363    MAKE_CASE(AArch64ISD::RET_GLUE)
2364    MAKE_CASE(AArch64ISD::BRCOND)
2365    MAKE_CASE(AArch64ISD::CSEL)
2366    MAKE_CASE(AArch64ISD::CSINV)
2367    MAKE_CASE(AArch64ISD::CSNEG)
2368    MAKE_CASE(AArch64ISD::CSINC)
2369    MAKE_CASE(AArch64ISD::THREAD_POINTER)
2370    MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
2371    MAKE_CASE(AArch64ISD::PROBED_ALLOCA)
2372    MAKE_CASE(AArch64ISD::ABDS_PRED)
2373    MAKE_CASE(AArch64ISD::ABDU_PRED)
2374    MAKE_CASE(AArch64ISD::HADDS_PRED)
2375    MAKE_CASE(AArch64ISD::HADDU_PRED)
2376    MAKE_CASE(AArch64ISD::MUL_PRED)
2377    MAKE_CASE(AArch64ISD::MULHS_PRED)
2378    MAKE_CASE(AArch64ISD::MULHU_PRED)
2379    MAKE_CASE(AArch64ISD::RHADDS_PRED)
2380    MAKE_CASE(AArch64ISD::RHADDU_PRED)
2381    MAKE_CASE(AArch64ISD::SDIV_PRED)
2382    MAKE_CASE(AArch64ISD::SHL_PRED)
2383    MAKE_CASE(AArch64ISD::SMAX_PRED)
2384    MAKE_CASE(AArch64ISD::SMIN_PRED)
2385    MAKE_CASE(AArch64ISD::SRA_PRED)
2386    MAKE_CASE(AArch64ISD::SRL_PRED)
2387    MAKE_CASE(AArch64ISD::UDIV_PRED)
2388    MAKE_CASE(AArch64ISD::UMAX_PRED)
2389    MAKE_CASE(AArch64ISD::UMIN_PRED)
2390    MAKE_CASE(AArch64ISD::SRAD_MERGE_OP1)
2391    MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU)
2392    MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU)
2393    MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU)
2394    MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU)
2395    MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU)
2396    MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU)
2397    MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU)
2398    MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU)
2399    MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU)
2400    MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU)
2401    MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU)
2402    MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU)
2403    MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU)
2404    MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU)
2405    MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU)
2406    MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU)
2407    MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU)
2408    MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU)
2409    MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU)
2410    MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU)
2411    MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU)
2412    MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO)
2413    MAKE_CASE(AArch64ISD::ADC)
2414    MAKE_CASE(AArch64ISD::SBC)
2415    MAKE_CASE(AArch64ISD::ADDS)
2416    MAKE_CASE(AArch64ISD::SUBS)
2417    MAKE_CASE(AArch64ISD::ADCS)
2418    MAKE_CASE(AArch64ISD::SBCS)
2419    MAKE_CASE(AArch64ISD::ANDS)
2420    MAKE_CASE(AArch64ISD::CCMP)
2421    MAKE_CASE(AArch64ISD::CCMN)
2422    MAKE_CASE(AArch64ISD::FCCMP)
2423    MAKE_CASE(AArch64ISD::FCMP)
2424    MAKE_CASE(AArch64ISD::STRICT_FCMP)
2425    MAKE_CASE(AArch64ISD::STRICT_FCMPE)
2426    MAKE_CASE(AArch64ISD::SME_ZA_LDR)
2427    MAKE_CASE(AArch64ISD::SME_ZA_STR)
2428    MAKE_CASE(AArch64ISD::DUP)
2429    MAKE_CASE(AArch64ISD::DUPLANE8)
2430    MAKE_CASE(AArch64ISD::DUPLANE16)
2431    MAKE_CASE(AArch64ISD::DUPLANE32)
2432    MAKE_CASE(AArch64ISD::DUPLANE64)
2433    MAKE_CASE(AArch64ISD::DUPLANE128)
2434    MAKE_CASE(AArch64ISD::MOVI)
2435    MAKE_CASE(AArch64ISD::MOVIshift)
2436    MAKE_CASE(AArch64ISD::MOVIedit)
2437    MAKE_CASE(AArch64ISD::MOVImsl)
2438    MAKE_CASE(AArch64ISD::FMOV)
2439    MAKE_CASE(AArch64ISD::MVNIshift)
2440    MAKE_CASE(AArch64ISD::MVNImsl)
2441    MAKE_CASE(AArch64ISD::BICi)
2442    MAKE_CASE(AArch64ISD::ORRi)
2443    MAKE_CASE(AArch64ISD::BSP)
2444    MAKE_CASE(AArch64ISD::ZIP1)
2445    MAKE_CASE(AArch64ISD::ZIP2)
2446    MAKE_CASE(AArch64ISD::UZP1)
2447    MAKE_CASE(AArch64ISD::UZP2)
2448    MAKE_CASE(AArch64ISD::TRN1)
2449    MAKE_CASE(AArch64ISD::TRN2)
2450    MAKE_CASE(AArch64ISD::REV16)
2451    MAKE_CASE(AArch64ISD::REV32)
2452    MAKE_CASE(AArch64ISD::REV64)
2453    MAKE_CASE(AArch64ISD::EXT)
2454    MAKE_CASE(AArch64ISD::SPLICE)
2455    MAKE_CASE(AArch64ISD::VSHL)
2456    MAKE_CASE(AArch64ISD::VLSHR)
2457    MAKE_CASE(AArch64ISD::VASHR)
2458    MAKE_CASE(AArch64ISD::VSLI)
2459    MAKE_CASE(AArch64ISD::VSRI)
2460    MAKE_CASE(AArch64ISD::CMEQ)
2461    MAKE_CASE(AArch64ISD::CMGE)
2462    MAKE_CASE(AArch64ISD::CMGT)
2463    MAKE_CASE(AArch64ISD::CMHI)
2464    MAKE_CASE(AArch64ISD::CMHS)
2465    MAKE_CASE(AArch64ISD::FCMEQ)
2466    MAKE_CASE(AArch64ISD::FCMGE)
2467    MAKE_CASE(AArch64ISD::FCMGT)
2468    MAKE_CASE(AArch64ISD::CMEQz)
2469    MAKE_CASE(AArch64ISD::CMGEz)
2470    MAKE_CASE(AArch64ISD::CMGTz)
2471    MAKE_CASE(AArch64ISD::CMLEz)
2472    MAKE_CASE(AArch64ISD::CMLTz)
2473    MAKE_CASE(AArch64ISD::FCMEQz)
2474    MAKE_CASE(AArch64ISD::FCMGEz)
2475    MAKE_CASE(AArch64ISD::FCMGTz)
2476    MAKE_CASE(AArch64ISD::FCMLEz)
2477    MAKE_CASE(AArch64ISD::FCMLTz)
2478    MAKE_CASE(AArch64ISD::SADDV)
2479    MAKE_CASE(AArch64ISD::UADDV)
2480    MAKE_CASE(AArch64ISD::UADDLV)
2481    MAKE_CASE(AArch64ISD::SADDLV)
2482    MAKE_CASE(AArch64ISD::SDOT)
2483    MAKE_CASE(AArch64ISD::UDOT)
2484    MAKE_CASE(AArch64ISD::SMINV)
2485    MAKE_CASE(AArch64ISD::UMINV)
2486    MAKE_CASE(AArch64ISD::SMAXV)
2487    MAKE_CASE(AArch64ISD::UMAXV)
2488    MAKE_CASE(AArch64ISD::SADDV_PRED)
2489    MAKE_CASE(AArch64ISD::UADDV_PRED)
2490    MAKE_CASE(AArch64ISD::SMAXV_PRED)
2491    MAKE_CASE(AArch64ISD::UMAXV_PRED)
2492    MAKE_CASE(AArch64ISD::SMINV_PRED)
2493    MAKE_CASE(AArch64ISD::UMINV_PRED)
2494    MAKE_CASE(AArch64ISD::ORV_PRED)
2495    MAKE_CASE(AArch64ISD::EORV_PRED)
2496    MAKE_CASE(AArch64ISD::ANDV_PRED)
2497    MAKE_CASE(AArch64ISD::CLASTA_N)
2498    MAKE_CASE(AArch64ISD::CLASTB_N)
2499    MAKE_CASE(AArch64ISD::LASTA)
2500    MAKE_CASE(AArch64ISD::LASTB)
2501    MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
2502    MAKE_CASE(AArch64ISD::LS64_BUILD)
2503    MAKE_CASE(AArch64ISD::LS64_EXTRACT)
2504    MAKE_CASE(AArch64ISD::TBL)
2505    MAKE_CASE(AArch64ISD::FADD_PRED)
2506    MAKE_CASE(AArch64ISD::FADDA_PRED)
2507    MAKE_CASE(AArch64ISD::FADDV_PRED)
2508    MAKE_CASE(AArch64ISD::FDIV_PRED)
2509    MAKE_CASE(AArch64ISD::FMA_PRED)
2510    MAKE_CASE(AArch64ISD::FMAX_PRED)
2511    MAKE_CASE(AArch64ISD::FMAXV_PRED)
2512    MAKE_CASE(AArch64ISD::FMAXNM_PRED)
2513    MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
2514    MAKE_CASE(AArch64ISD::FMIN_PRED)
2515    MAKE_CASE(AArch64ISD::FMINV_PRED)
2516    MAKE_CASE(AArch64ISD::FMINNM_PRED)
2517    MAKE_CASE(AArch64ISD::FMINNMV_PRED)
2518    MAKE_CASE(AArch64ISD::FMUL_PRED)
2519    MAKE_CASE(AArch64ISD::FSUB_PRED)
2520    MAKE_CASE(AArch64ISD::RDSVL)
2521    MAKE_CASE(AArch64ISD::BIC)
2522    MAKE_CASE(AArch64ISD::BIT)
2523    MAKE_CASE(AArch64ISD::CBZ)
2524    MAKE_CASE(AArch64ISD::CBNZ)
2525    MAKE_CASE(AArch64ISD::TBZ)
2526    MAKE_CASE(AArch64ISD::TBNZ)
2527    MAKE_CASE(AArch64ISD::TC_RETURN)
2528    MAKE_CASE(AArch64ISD::PREFETCH)
2529    MAKE_CASE(AArch64ISD::SITOF)
2530    MAKE_CASE(AArch64ISD::UITOF)
2531    MAKE_CASE(AArch64ISD::NVCAST)
2532    MAKE_CASE(AArch64ISD::MRS)
2533    MAKE_CASE(AArch64ISD::SQSHL_I)
2534    MAKE_CASE(AArch64ISD::UQSHL_I)
2535    MAKE_CASE(AArch64ISD::SRSHR_I)
2536    MAKE_CASE(AArch64ISD::URSHR_I)
2537    MAKE_CASE(AArch64ISD::SQSHLU_I)
2538    MAKE_CASE(AArch64ISD::WrapperLarge)
2539    MAKE_CASE(AArch64ISD::LD2post)
2540    MAKE_CASE(AArch64ISD::LD3post)
2541    MAKE_CASE(AArch64ISD::LD4post)
2542    MAKE_CASE(AArch64ISD::ST2post)
2543    MAKE_CASE(AArch64ISD::ST3post)
2544    MAKE_CASE(AArch64ISD::ST4post)
2545    MAKE_CASE(AArch64ISD::LD1x2post)
2546    MAKE_CASE(AArch64ISD::LD1x3post)
2547    MAKE_CASE(AArch64ISD::LD1x4post)
2548    MAKE_CASE(AArch64ISD::ST1x2post)
2549    MAKE_CASE(AArch64ISD::ST1x3post)
2550    MAKE_CASE(AArch64ISD::ST1x4post)
2551    MAKE_CASE(AArch64ISD::LD1DUPpost)
2552    MAKE_CASE(AArch64ISD::LD2DUPpost)
2553    MAKE_CASE(AArch64ISD::LD3DUPpost)
2554    MAKE_CASE(AArch64ISD::LD4DUPpost)
2555    MAKE_CASE(AArch64ISD::LD1LANEpost)
2556    MAKE_CASE(AArch64ISD::LD2LANEpost)
2557    MAKE_CASE(AArch64ISD::LD3LANEpost)
2558    MAKE_CASE(AArch64ISD::LD4LANEpost)
2559    MAKE_CASE(AArch64ISD::ST2LANEpost)
2560    MAKE_CASE(AArch64ISD::ST3LANEpost)
2561    MAKE_CASE(AArch64ISD::ST4LANEpost)
2562    MAKE_CASE(AArch64ISD::SMULL)
2563    MAKE_CASE(AArch64ISD::UMULL)
2564    MAKE_CASE(AArch64ISD::PMULL)
2565    MAKE_CASE(AArch64ISD::FRECPE)
2566    MAKE_CASE(AArch64ISD::FRECPS)
2567    MAKE_CASE(AArch64ISD::FRSQRTE)
2568    MAKE_CASE(AArch64ISD::FRSQRTS)
2569    MAKE_CASE(AArch64ISD::STG)
2570    MAKE_CASE(AArch64ISD::STZG)
2571    MAKE_CASE(AArch64ISD::ST2G)
2572    MAKE_CASE(AArch64ISD::STZ2G)
2573    MAKE_CASE(AArch64ISD::SUNPKHI)
2574    MAKE_CASE(AArch64ISD::SUNPKLO)
2575    MAKE_CASE(AArch64ISD::UUNPKHI)
2576    MAKE_CASE(AArch64ISD::UUNPKLO)
2577    MAKE_CASE(AArch64ISD::INSR)
2578    MAKE_CASE(AArch64ISD::PTEST)
2579    MAKE_CASE(AArch64ISD::PTEST_ANY)
2580    MAKE_CASE(AArch64ISD::PTRUE)
2581    MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO)
2582    MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO)
2583    MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO)
2584    MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO)
2585    MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO)
2586    MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO)
2587    MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO)
2588    MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO)
2589    MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO)
2590    MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO)
2591    MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO)
2592    MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO)
2593    MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO)
2594    MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO)
2595    MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO)
2596    MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO)
2597    MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
2598    MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
2599    MAKE_CASE(AArch64ISD::GLD1Q_MERGE_ZERO)
2600    MAKE_CASE(AArch64ISD::GLD1Q_INDEX_MERGE_ZERO)
2601    MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
2602    MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
2603    MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
2604    MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO)
2605    MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO)
2606    MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO)
2607    MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO)
2608    MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO)
2609    MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO)
2610    MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO)
2611    MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO)
2612    MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO)
2613    MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO)
2614    MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO)
2615    MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO)
2616    MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO)
2617    MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO)
2618    MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO)
2619    MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO)
2620    MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO)
2621    MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO)
2622    MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO)
2623    MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
2624    MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
2625    MAKE_CASE(AArch64ISD::SST1Q_PRED)
2626    MAKE_CASE(AArch64ISD::SST1Q_INDEX_PRED)
2627    MAKE_CASE(AArch64ISD::ST1_PRED)
2628    MAKE_CASE(AArch64ISD::SST1_PRED)
2629    MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
2630    MAKE_CASE(AArch64ISD::SST1_SXTW_PRED)
2631    MAKE_CASE(AArch64ISD::SST1_UXTW_PRED)
2632    MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED)
2633    MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED)
2634    MAKE_CASE(AArch64ISD::SST1_IMM_PRED)
2635    MAKE_CASE(AArch64ISD::SSTNT1_PRED)
2636    MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
2637    MAKE_CASE(AArch64ISD::LDP)
2638    MAKE_CASE(AArch64ISD::LDIAPP)
2639    MAKE_CASE(AArch64ISD::LDNP)
2640    MAKE_CASE(AArch64ISD::STP)
2641    MAKE_CASE(AArch64ISD::STILP)
2642    MAKE_CASE(AArch64ISD::STNP)
2643    MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
2644    MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)
2645    MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU)
2646    MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU)
2647    MAKE_CASE(AArch64ISD::REVD_MERGE_PASSTHRU)
2648    MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU)
2649    MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
2650    MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
2651    MAKE_CASE(AArch64ISD::INDEX_VECTOR)
2652    MAKE_CASE(AArch64ISD::ADDP)
2653    MAKE_CASE(AArch64ISD::SADDLP)
2654    MAKE_CASE(AArch64ISD::UADDLP)
2655    MAKE_CASE(AArch64ISD::CALL_RVMARKER)
2656    MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL)
2657    MAKE_CASE(AArch64ISD::MOPS_MEMSET)
2658    MAKE_CASE(AArch64ISD::MOPS_MEMSET_TAGGING)
2659    MAKE_CASE(AArch64ISD::MOPS_MEMCOPY)
2660    MAKE_CASE(AArch64ISD::MOPS_MEMMOVE)
2661    MAKE_CASE(AArch64ISD::CALL_BTI)
2662    MAKE_CASE(AArch64ISD::MRRS)
2663    MAKE_CASE(AArch64ISD::MSRR)
2664    MAKE_CASE(AArch64ISD::RSHRNB_I)
2665    MAKE_CASE(AArch64ISD::CTTZ_ELTS)
2666    MAKE_CASE(AArch64ISD::CALL_ARM64EC_TO_X64)
2667  }
2668#undef MAKE_CASE
2669  return nullptr;
2670}
2671
2672MachineBasicBlock *
2673AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2674                                    MachineBasicBlock *MBB) const {
2675  // We materialise the F128CSEL pseudo-instruction as some control flow and a
2676  // phi node:
2677
2678  // OrigBB:
2679  //     [... previous instrs leading to comparison ...]
2680  //     b.ne TrueBB
2681  //     b EndBB
2682  // TrueBB:
2683  //     ; Fallthrough
2684  // EndBB:
2685  //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2686
2687  MachineFunction *MF = MBB->getParent();
2688  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2689  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2690  DebugLoc DL = MI.getDebugLoc();
2691  MachineFunction::iterator It = ++MBB->getIterator();
2692
2693  Register DestReg = MI.getOperand(0).getReg();
2694  Register IfTrueReg = MI.getOperand(1).getReg();
2695  Register IfFalseReg = MI.getOperand(2).getReg();
2696  unsigned CondCode = MI.getOperand(3).getImm();
2697  bool NZCVKilled = MI.getOperand(4).isKill();
2698
2699  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2700  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2701  MF->insert(It, TrueBB);
2702  MF->insert(It, EndBB);
2703
  // Transfer the rest of the current basic block to EndBB.
2705  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2706                MBB->end());
2707  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
2708
2709  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2710  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2711  MBB->addSuccessor(TrueBB);
2712  MBB->addSuccessor(EndBB);
2713
2714  // TrueBB falls through to the end.
2715  TrueBB->addSuccessor(EndBB);
2716
2717  if (!NZCVKilled) {
2718    TrueBB->addLiveIn(AArch64::NZCV);
2719    EndBB->addLiveIn(AArch64::NZCV);
2720  }
2721
2722  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2723      .addReg(IfTrueReg)
2724      .addMBB(TrueBB)
2725      .addReg(IfFalseReg)
2726      .addMBB(MBB);
2727
2728  MI.eraseFromParent();
2729  return EndBB;
2730}
2731
2732MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
2733       MachineInstr &MI, MachineBasicBlock *BB) const {
2734  assert(!isAsynchronousEHPersonality(classifyEHPersonality(
2735             BB->getParent()->getFunction().getPersonalityFn())) &&
2736         "SEH does not use catchret!");
2737  return BB;
2738}
2739
2740MachineBasicBlock *
2741AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
2742                                              MachineBasicBlock *MBB) const {
2743  MachineFunction &MF = *MBB->getParent();
2744  MachineBasicBlock::iterator MBBI = MI.getIterator();
2745  DebugLoc DL = MBB->findDebugLoc(MBBI);
2746  const AArch64InstrInfo &TII =
2747      *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2748  Register TargetReg = MI.getOperand(0).getReg();
2749  MachineBasicBlock::iterator NextInst =
2750      TII.probedStackAlloc(MBBI, TargetReg, false);
2751
2752  MI.eraseFromParent();
2753  return NextInst->getParent();
2754}
2755
2756MachineBasicBlock *
2757AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2758                                    MachineInstr &MI,
2759                                    MachineBasicBlock *BB) const {
2760  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2761  MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2762
2763  MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2764  MIB.add(MI.getOperand(1)); // slice index register
2765  MIB.add(MI.getOperand(2)); // slice index offset
2766  MIB.add(MI.getOperand(3)); // pg
2767  MIB.add(MI.getOperand(4)); // base
2768  MIB.add(MI.getOperand(5)); // offset
2769
2770  MI.eraseFromParent(); // The pseudo is gone now.
2771  return BB;
2772}
2773
2774MachineBasicBlock *
2775AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
2776  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2777  MachineInstrBuilder MIB =
2778      BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2779
2780  MIB.addReg(AArch64::ZA, RegState::Define);
2781  MIB.add(MI.getOperand(0)); // Vector select register
2782  MIB.add(MI.getOperand(1)); // Vector select offset
2783  MIB.add(MI.getOperand(2)); // Base
2784  MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2785
2786  MI.eraseFromParent(); // The pseudo is gone now.
2787  return BB;
2788}
2789
2790MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI,
2791                                                      MachineBasicBlock *BB,
2792                                                      unsigned Opcode,
2793                                                      bool Op0IsDef) const {
2794  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2795  MachineInstrBuilder MIB;
2796
2797  MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
2798            .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
2799  for (unsigned I = 1; I < MI.getNumOperands(); ++I)
2800    MIB.add(MI.getOperand(I));
2801
2802  MI.eraseFromParent(); // The pseudo is gone now.
2803  return BB;
2804}
2805
2806MachineBasicBlock *
2807AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2808                                   MachineInstr &MI,
2809                                   MachineBasicBlock *BB, bool HasTile) const {
2810  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2811  MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2812  unsigned StartIdx = 0;
2813
2814  if (HasTile) {
2815    MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2816    MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2817    StartIdx = 1;
2818  } else
2819    MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
2820
2821  for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
2822    MIB.add(MI.getOperand(I));
2823
2824  MI.eraseFromParent(); // The pseudo is gone now.
2825  return BB;
2826}
2827
2828MachineBasicBlock *
2829AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
2830  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2831  MachineInstrBuilder MIB =
2832      BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
2833  MIB.add(MI.getOperand(0)); // Mask
2834
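  // Each bit I set in the mask immediate corresponds to the 64-bit tile
  // ZAD0+I, which is implicitly defined below; e.g. a mask of 0b00100001
  // marks ZAD0 and ZAD5 as zeroed by this single instruction.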
2835  unsigned Mask = MI.getOperand(0).getImm();
2836  for (unsigned I = 0; I < 8; I++) {
2837    if (Mask & (1 << I))
2838      MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
2839  }
2840
2841  MI.eraseFromParent(); // The pseudo is gone now.
2842  return BB;
2843}
2844
2845MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
2846    MachineInstr &MI, MachineBasicBlock *BB) const {
2847
2848  int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
2849  if (SMEOrigInstr != -1) {
2850    const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2851    uint64_t SMEMatrixType =
2852        TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
2853    switch (SMEMatrixType) {
2854    case (AArch64::SMEMatrixArray):
2855      return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false);
2856    case (AArch64::SMEMatrixTileB):
2857      return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true);
2858    case (AArch64::SMEMatrixTileH):
2859      return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true);
2860    case (AArch64::SMEMatrixTileS):
2861      return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true);
2862    case (AArch64::SMEMatrixTileD):
2863      return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true);
2864    case (AArch64::SMEMatrixTileQ):
2865      return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true);
2866    }
2867  }
2868
2869  switch (MI.getOpcode()) {
2870  default:
2871#ifndef NDEBUG
2872    MI.dump();
2873#endif
2874    llvm_unreachable("Unexpected instruction for custom inserter!");
2875
2876  case AArch64::F128CSEL:
2877    return EmitF128CSEL(MI, BB);
2878  case TargetOpcode::STATEPOINT:
    // STATEPOINT is a pseudo instruction which has no implicit defs/uses,
    // while the BL call instruction it is eventually lowered to has an
    // implicit def of LR. This def is early-clobber, as it is set at the
    // moment of the call, before any use is read.
    // Add this implicit dead def here as a workaround.
2884    MI.addOperand(*MI.getMF(),
2885                  MachineOperand::CreateReg(
2886                      AArch64::LR, /*isDef*/ true,
2887                      /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
2888                      /*isUndef*/ false, /*isEarlyClobber*/ true));
2889    [[fallthrough]];
2890  case TargetOpcode::STACKMAP:
2891  case TargetOpcode::PATCHPOINT:
2892    return emitPatchPoint(MI, BB);
2893
2894  case TargetOpcode::PATCHABLE_EVENT_CALL:
2895  case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
2896    return BB;
2897
2898  case AArch64::CATCHRET:
2899    return EmitLoweredCatchRet(MI, BB);
2900
2901  case AArch64::PROBED_STACKALLOC_DYN:
2902    return EmitDynamicProbedAlloc(MI, BB);
2903
2904  case AArch64::LD1_MXIPXX_H_PSEUDO_B:
2905    return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
2906  case AArch64::LD1_MXIPXX_H_PSEUDO_H:
2907    return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
2908  case AArch64::LD1_MXIPXX_H_PSEUDO_S:
2909    return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
2910  case AArch64::LD1_MXIPXX_H_PSEUDO_D:
2911    return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
2912  case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
2913    return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
2914  case AArch64::LD1_MXIPXX_V_PSEUDO_B:
2915    return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
2916  case AArch64::LD1_MXIPXX_V_PSEUDO_H:
2917    return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
2918  case AArch64::LD1_MXIPXX_V_PSEUDO_S:
2919    return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
2920  case AArch64::LD1_MXIPXX_V_PSEUDO_D:
2921    return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
2922  case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
2923    return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
2924  case AArch64::LDR_ZA_PSEUDO:
2925    return EmitFill(MI, BB);
2926  case AArch64::LDR_TX_PSEUDO:
2927    return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
2928  case AArch64::STR_TX_PSEUDO:
2929    return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
2930  case AArch64::ZERO_M_PSEUDO:
2931    return EmitZero(MI, BB);
2932  case AArch64::ZERO_T_PSEUDO:
2933    return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
2934  }
2935}
2936
2937//===----------------------------------------------------------------------===//
2938// AArch64 Lowering private implementation.
2939//===----------------------------------------------------------------------===//
2940
2941//===----------------------------------------------------------------------===//
2942// Lowering Code
2943//===----------------------------------------------------------------------===//
2944
2945// Forward declarations of SVE fixed length lowering helpers
2946static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT);
2947static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
2948static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
2949static SDValue convertFixedMaskToScalableVector(SDValue Mask,
2950                                                SelectionDAG &DAG);
2951static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
2952                                             EVT VT);
2953
2954/// isZerosVector - Check whether SDNode N is a zero-filled vector.
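/// For example, both a constant all-zeros splat and (AArch64ISD::DUP 0), even
/// when hidden behind a bitcast, are recognised here.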
2955static bool isZerosVector(const SDNode *N) {
2956  // Look through a bit convert.
2957  while (N->getOpcode() == ISD::BITCAST)
2958    N = N->getOperand(0).getNode();
2959
2960  if (ISD::isConstantSplatVectorAllZeros(N))
2961    return true;
2962
2963  if (N->getOpcode() != AArch64ISD::DUP)
2964    return false;
2965
2966  auto Opnd0 = N->getOperand(0);
2967  return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
2968}
2969
2970/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
2971/// CC
2972static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
2973  switch (CC) {
2974  default:
2975    llvm_unreachable("Unknown condition code!");
2976  case ISD::SETNE:
2977    return AArch64CC::NE;
2978  case ISD::SETEQ:
2979    return AArch64CC::EQ;
2980  case ISD::SETGT:
2981    return AArch64CC::GT;
2982  case ISD::SETGE:
2983    return AArch64CC::GE;
2984  case ISD::SETLT:
2985    return AArch64CC::LT;
2986  case ISD::SETLE:
2987    return AArch64CC::LE;
2988  case ISD::SETUGT:
2989    return AArch64CC::HI;
2990  case ISD::SETUGE:
2991    return AArch64CC::HS;
2992  case ISD::SETULT:
2993    return AArch64CC::LO;
2994  case ISD::SETULE:
2995    return AArch64CC::LS;
2996  }
2997}
2998
2999/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3000static void changeFPCCToAArch64CC(ISD::CondCode CC,
3001                                  AArch64CC::CondCode &CondCode,
3002                                  AArch64CC::CondCode &CondCode2) {
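  // Some FP conditions have no single AArch64 encoding and are returned as a
  // pair of condition codes that the caller must OR together; e.g. SETONE
  // ("ordered and not equal") is tested as MI || GT below.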
3003  CondCode2 = AArch64CC::AL;
3004  switch (CC) {
3005  default:
3006    llvm_unreachable("Unknown FP condition!");
3007  case ISD::SETEQ:
3008  case ISD::SETOEQ:
3009    CondCode = AArch64CC::EQ;
3010    break;
3011  case ISD::SETGT:
3012  case ISD::SETOGT:
3013    CondCode = AArch64CC::GT;
3014    break;
3015  case ISD::SETGE:
3016  case ISD::SETOGE:
3017    CondCode = AArch64CC::GE;
3018    break;
3019  case ISD::SETOLT:
3020    CondCode = AArch64CC::MI;
3021    break;
3022  case ISD::SETOLE:
3023    CondCode = AArch64CC::LS;
3024    break;
3025  case ISD::SETONE:
3026    CondCode = AArch64CC::MI;
3027    CondCode2 = AArch64CC::GT;
3028    break;
3029  case ISD::SETO:
3030    CondCode = AArch64CC::VC;
3031    break;
3032  case ISD::SETUO:
3033    CondCode = AArch64CC::VS;
3034    break;
3035  case ISD::SETUEQ:
3036    CondCode = AArch64CC::EQ;
3037    CondCode2 = AArch64CC::VS;
3038    break;
3039  case ISD::SETUGT:
3040    CondCode = AArch64CC::HI;
3041    break;
3042  case ISD::SETUGE:
3043    CondCode = AArch64CC::PL;
3044    break;
3045  case ISD::SETLT:
3046  case ISD::SETULT:
3047    CondCode = AArch64CC::LT;
3048    break;
3049  case ISD::SETLE:
3050  case ISD::SETULE:
3051    CondCode = AArch64CC::LE;
3052    break;
3053  case ISD::SETNE:
3054  case ISD::SETUNE:
3055    CondCode = AArch64CC::NE;
3056    break;
3057  }
3058}
3059
3060/// Convert a DAG fp condition code to an AArch64 CC.
3061/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3062/// should be AND'ed instead of OR'ed.
3063static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
3064                                     AArch64CC::CondCode &CondCode,
3065                                     AArch64CC::CondCode &CondCode2) {
3066  CondCode2 = AArch64CC::AL;
3067  switch (CC) {
3068  default:
3069    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3070    assert(CondCode2 == AArch64CC::AL);
3071    break;
3072  case ISD::SETONE:
3073    // (a one b)
3074    // == ((a olt b) || (a ogt b))
3075    // == ((a ord b) && (a une b))
3076    CondCode = AArch64CC::VC;
3077    CondCode2 = AArch64CC::NE;
3078    break;
3079  case ISD::SETUEQ:
3080    // (a ueq b)
3081    // == ((a uno b) || (a oeq b))
3082    // == ((a ule b) && (a uge b))
3083    CondCode = AArch64CC::PL;
3084    CondCode2 = AArch64CC::LE;
3085    break;
3086  }
3087}
3088
3089/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3090/// CC usable with the vector instructions. Fewer operations are available
3091/// without a real NZCV register, so we have to use less efficient combinations
3092/// to get the same effect.
3093static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
3094                                        AArch64CC::CondCode &CondCode,
3095                                        AArch64CC::CondCode &CondCode2,
3096                                        bool &Invert) {
3097  Invert = false;
3098  switch (CC) {
3099  default:
3100    // Mostly the scalar mappings work fine.
3101    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3102    break;
3103  case ISD::SETUO:
3104    Invert = true;
3105    [[fallthrough]];
3106  case ISD::SETO:
3107    CondCode = AArch64CC::MI;
3108    CondCode2 = AArch64CC::GE;
3109    break;
3110  case ISD::SETUEQ:
3111  case ISD::SETULT:
3112  case ISD::SETULE:
3113  case ISD::SETUGT:
3114  case ISD::SETUGE:
3115    // All of the compare-mask comparisons are ordered, but we can switch
3116    // between the two by a double inversion. E.g. ULE == !OGT.
3117    Invert = true;
3118    changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3119                          CondCode, CondCode2);
3120    break;
3121  }
3122}
3123
3124static bool isLegalArithImmed(uint64_t C) {
3125  // Matches AArch64DAGToDAGISel::SelectArithImmed().
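  // i.e. a 12-bit unsigned value, optionally shifted left by 12 bits; for
  // example 0xFFF and 0xFFF000 are legal, 0x1001 is not.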
3126  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3127  LLVM_DEBUG(dbgs() << "Is imm " << C
3128                    << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3129  return IsLegal;
3130}
3131
// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on the
// grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags can
// be set differently by this operation. It comes down to whether
// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are equal
// then everything is fine; if not then the optimization is wrong. Thus general
// comparisons are only valid if op2 != 0.
3138//
3139// So, finally, the only LLVM-native comparisons that don't mention C and V
3140// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
3141// the absence of information about op2.
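// For example, with op2 == 0, "cmp w0, #0" always sets the C flag, whereas
// the would-be equivalent "cmn w0, #0" never does, so any condition that
// reads C (HS, LO, HI, LS, ...) could change meaning; EQ and NE are
// unaffected.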
3142static bool isCMN(SDValue Op, ISD::CondCode CC) {
3143  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3144         (CC == ISD::SETEQ || CC == ISD::SETNE);
3145}
3146
3147static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
3148                                      SelectionDAG &DAG, SDValue Chain,
3149                                      bool IsSignaling) {
3150  EVT VT = LHS.getValueType();
3151  assert(VT != MVT::f128);
3152
3153  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3154
3155  if (VT == MVT::f16 && !FullFP16) {
3156    LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3157                      {Chain, LHS});
3158    RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3159                      {LHS.getValue(1), RHS});
3160    Chain = RHS.getValue(1);
3161    VT = MVT::f32;
3162  }
3163  unsigned Opcode =
3164      IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3165  return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
3166}
3167
3168static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3169                              const SDLoc &dl, SelectionDAG &DAG) {
3170  EVT VT = LHS.getValueType();
3171  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3172
3173  if (VT.isFloatingPoint()) {
3174    assert(VT != MVT::f128);
3175    if (VT == MVT::f16 && !FullFP16) {
3176      LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3177      RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3178      VT = MVT::f32;
3179    }
3180    return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
3181  }
3182
3183  // The CMP instruction is just an alias for SUBS, and representing it as
3184  // SUBS means that it's possible to get CSE with subtract operations.
3185  // A later phase can perform the optimization of setting the destination
3186  // register to WZR/XZR if it ends up being unused.
3187  unsigned Opcode = AArch64ISD::SUBS;
3188
3189  if (isCMN(RHS, CC)) {
    // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3191    Opcode = AArch64ISD::ADDS;
3192    RHS = RHS.getOperand(1);
3193  } else if (isCMN(LHS, CC)) {
    // As we are looking for EQ/NE compares, the operands can be commuted; can
    // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3196    Opcode = AArch64ISD::ADDS;
3197    LHS = LHS.getOperand(1);
3198  } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3199    if (LHS.getOpcode() == ISD::AND) {
3200      // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3201      // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3202      // of the signed comparisons.
3203      const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
3204                                           DAG.getVTList(VT, MVT_CC),
3205                                           LHS.getOperand(0),
3206                                           LHS.getOperand(1));
3207      // Replace all users of (and X, Y) with newly generated (ands X, Y)
3208      DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3209      return ANDSNode.getValue(1);
3210    } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3211      // Use result of ANDS
3212      return LHS.getValue(1);
3213    }
3214  }
3215
3216  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
3217      .getValue(1);
3218}
3219
3220/// \defgroup AArch64CCMP CMP;CCMP matching
3221///
3222/// These functions deal with the formation of CMP;CCMP;... sequences.
3223/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3224/// a comparison. They set the NZCV flags to a predefined value if their
/// predicate is false. This allows us to express arbitrary conjunctions, for
3226/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3227/// expressed as:
3228///   cmp A
3229///   ccmp B, inv(CB), CA
3230///   check for CB flags
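/// As an illustrative sketch (hypothetical registers, not actual compiler
/// output), a conjunction such as "(a == 0) && (b > 5)" could become:
///   cmp  w0, #0
///   ccmp w1, #5, #4, eq   ; if eq, compare b with 5; otherwise force Z (#4)
///                         ; so that a following "gt" test fails
///   b.gt ...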
3231///
3232/// This naturally lets us implement chains of AND operations with SETCC
3233/// operands. And we can even implement some other situations by transforming
3234/// them:
3235///   - We can implement (NEG SETCC) i.e. negating a single comparison by
///     negating the flags used in the CCMP/FCCMP operations.
3237///   - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3238///     by negating the flags we test for afterwards. i.e.
///     NEG (CMP CCMP CCMP ...) can be implemented.
3240///   - Note that we can only ever negate all previously processed results.
3241///     What we can not implement by flipping the flags to test is a negation
3242///     of two sub-trees (because the negation affects all sub-trees emitted so
3243///     far, so the 2nd sub-tree we emit would also affect the first).
3244/// With those tools we can implement some OR operations:
3245///   - (OR (SETCC A) (SETCC B)) can be implemented via:
3246///     NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3247///   - After transforming OR to NEG/AND combinations we may be able to use NEG
3248///     elimination rules from earlier to implement the whole thing as a
3249///     CCMP/FCCMP chain.
3250///
/// As a complete example:
///     or (or (setCA (cmp A)) (setCB (cmp B)))
///        (and (setCC (cmp C)) (setCD (cmp D)))
/// can be reassociated to:
///     or (and (setCC (cmp C)) (setCD (cmp D)))
///        (or (setCA (cmp A)) (setCB (cmp B)))
/// can be transformed to:
///     not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
///              (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
3260/// which can be implemented as:
3261///   cmp C
3262///   ccmp D, inv(CD), CC
3263///   ccmp A, CA, inv(CD)
3264///   ccmp B, CB, inv(CA)
3265///   check for CB flags
3266///
/// A counterexample is "or (and A B) (and C D)", which translates to
/// not (and (not (and A B)) (not (and C D))); we can implement only one of
/// the two inner (not) operations, not both.
3270/// @{
3271
3272/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3273static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3274                                         ISD::CondCode CC, SDValue CCOp,
3275                                         AArch64CC::CondCode Predicate,
3276                                         AArch64CC::CondCode OutCC,
3277                                         const SDLoc &DL, SelectionDAG &DAG) {
3278  unsigned Opcode = 0;
3279  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3280
3281  if (LHS.getValueType().isFloatingPoint()) {
3282    assert(LHS.getValueType() != MVT::f128);
3283    if (LHS.getValueType() == MVT::f16 && !FullFP16) {
3284      LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3285      RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3286    }
3287    Opcode = AArch64ISD::FCCMP;
3288  } else if (RHS.getOpcode() == ISD::SUB) {
3289    SDValue SubOp0 = RHS.getOperand(0);
3290    if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
3291      // See emitComparison() on why we can only do this for SETEQ and SETNE.
3292      Opcode = AArch64ISD::CCMN;
3293      RHS = RHS.getOperand(1);
3294    }
3295  }
3296  if (Opcode == 0)
3297    Opcode = AArch64ISD::CCMP;
3298
3299  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3300  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3301  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3302  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3303  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3304}
3305
3306/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3307/// expressed as a conjunction. See \ref AArch64CCMP.
3308/// \param CanNegate    Set to true if we can negate the whole sub-tree just by
3309///                     changing the conditions on the SETCC tests.
3310///                     (this means we can call emitConjunctionRec() with
3311///                      Negate==true on this sub-tree)
3312/// \param MustBeFirst  Set to true if this subtree needs to be negated and we
3313///                     cannot do the negation naturally. We are required to
3314///                     emit the subtree first in this case.
/// \param WillNegate   Is true if we are called when the result of this
3316///                     subexpression must be negated. This happens when the
3317///                     outer expression is an OR. We can use this fact to know
3318///                     that we have a double negation (or (or ...) ...) that
3319///                     can be implemented for free.
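/// For example, (or (setcc A) (setcc B)) is accepted with CanNegate == true,
/// whereas (or (and ...) (and ...)) is rejected: an AND cannot be negated
/// naturally, so neither side of that OR can be.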
3320static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3321                               bool &MustBeFirst, bool WillNegate,
3322                               unsigned Depth = 0) {
3323  if (!Val.hasOneUse())
3324    return false;
3325  unsigned Opcode = Val->getOpcode();
3326  if (Opcode == ISD::SETCC) {
3327    if (Val->getOperand(0).getValueType() == MVT::f128)
3328      return false;
3329    CanNegate = true;
3330    MustBeFirst = false;
3331    return true;
3332  }
3333  // Protect against exponential runtime and stack overflow.
3334  if (Depth > 6)
3335    return false;
3336  if (Opcode == ISD::AND || Opcode == ISD::OR) {
3337    bool IsOR = Opcode == ISD::OR;
3338    SDValue O0 = Val->getOperand(0);
3339    SDValue O1 = Val->getOperand(1);
3340    bool CanNegateL;
3341    bool MustBeFirstL;
3342    if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3343      return false;
3344    bool CanNegateR;
3345    bool MustBeFirstR;
3346    if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3347      return false;
3348
3349    if (MustBeFirstL && MustBeFirstR)
3350      return false;
3351
3352    if (IsOR) {
3353      // For an OR expression we need to be able to naturally negate at least
3354      // one side or we cannot do the transformation at all.
3355      if (!CanNegateL && !CanNegateR)
3356        return false;
      // If the result of the OR will be negated and we can naturally negate
      // the leaves, then this sub-tree as a whole negates naturally.
3359      CanNegate = WillNegate && CanNegateL && CanNegateR;
3360      // If we cannot naturally negate the whole sub-tree, then this must be
3361      // emitted first.
3362      MustBeFirst = !CanNegate;
3363    } else {
3364      assert(Opcode == ISD::AND && "Must be OR or AND");
3365      // We cannot naturally negate an AND operation.
3366      CanNegate = false;
3367      MustBeFirst = MustBeFirstL || MustBeFirstR;
3368    }
3369    return true;
3370  }
3371  return false;
3372}
3373
3374/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
/// Tries to transform the given i1 producing node @p Val to a series of
/// compare and conditional compare operations. @returns an NZCV flags
/// producing node and sets @p OutCC to the flags that should be tested, or
/// returns SDValue() if the transformation was not possible.
/// \p Negate is true if we want this sub-tree to be negated just by changing
/// SETCC conditions.
3382static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3383    AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3384    AArch64CC::CondCode Predicate) {
3385  // We're at a tree leaf, produce a conditional comparison operation.
3386  unsigned Opcode = Val->getOpcode();
3387  if (Opcode == ISD::SETCC) {
3388    SDValue LHS = Val->getOperand(0);
3389    SDValue RHS = Val->getOperand(1);
3390    ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3391    bool isInteger = LHS.getValueType().isInteger();
3392    if (Negate)
3393      CC = getSetCCInverse(CC, LHS.getValueType());
3394    SDLoc DL(Val);
3395    // Determine OutCC and handle FP special case.
3396    if (isInteger) {
3397      OutCC = changeIntCCToAArch64CC(CC);
3398    } else {
3399      assert(LHS.getValueType().isFloatingPoint());
3400      AArch64CC::CondCode ExtraCC;
3401      changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3402      // Some floating point conditions can't be tested with a single condition
3403      // code. Construct an additional comparison in this case.
3404      if (ExtraCC != AArch64CC::AL) {
3405        SDValue ExtraCmp;
3406        if (!CCOp.getNode())
3407          ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3408        else
3409          ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3410                                               ExtraCC, DL, DAG);
3411        CCOp = ExtraCmp;
3412        Predicate = ExtraCC;
3413      }
3414    }
3415
3416    // Produce a normal comparison if we are first in the chain
3417    if (!CCOp)
3418      return emitComparison(LHS, RHS, CC, DL, DAG);
3419    // Otherwise produce a ccmp.
3420    return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3421                                     DAG);
3422  }
3423  assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3424
3425  bool IsOR = Opcode == ISD::OR;
3426
3427  SDValue LHS = Val->getOperand(0);
3428  bool CanNegateL;
3429  bool MustBeFirstL;
3430  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3431  assert(ValidL && "Valid conjunction/disjunction tree");
3432  (void)ValidL;
3433
3434  SDValue RHS = Val->getOperand(1);
3435  bool CanNegateR;
3436  bool MustBeFirstR;
3437  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3438  assert(ValidR && "Valid conjunction/disjunction tree");
3439  (void)ValidR;
3440
3441  // Swap sub-tree that must come first to the right side.
3442  if (MustBeFirstL) {
3443    assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3444    std::swap(LHS, RHS);
3445    std::swap(CanNegateL, CanNegateR);
3446    std::swap(MustBeFirstL, MustBeFirstR);
3447  }
3448
3449  bool NegateR;
3450  bool NegateAfterR;
3451  bool NegateL;
3452  bool NegateAfterAll;
3453  if (Opcode == ISD::OR) {
3454    // Swap the sub-tree that we can negate naturally to the left.
3455    if (!CanNegateL) {
3456      assert(CanNegateR && "at least one side must be negatable");
3457      assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3458      assert(!Negate);
3459      std::swap(LHS, RHS);
3460      NegateR = false;
3461      NegateAfterR = true;
3462    } else {
3463      // Negate the left sub-tree if possible, otherwise negate the result.
3464      NegateR = CanNegateR;
3465      NegateAfterR = !CanNegateR;
3466    }
3467    NegateL = true;
3468    NegateAfterAll = !Negate;
3469  } else {
3470    assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3471    assert(!Negate && "Valid conjunction/disjunction tree");
3472
3473    NegateL = false;
3474    NegateR = false;
3475    NegateAfterR = false;
3476    NegateAfterAll = false;
3477  }
3478
3479  // Emit sub-trees.
3480  AArch64CC::CondCode RHSCC;
3481  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3482  if (NegateAfterR)
3483    RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3484  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3485  if (NegateAfterAll)
3486    OutCC = AArch64CC::getInvertedCondCode(OutCC);
3487  return CmpL;
3488}
3489
/// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
3491/// In some cases this is even possible with OR operations in the expression.
3492/// See \ref AArch64CCMP.
3493/// \see emitConjunctionRec().
3494static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3495                               AArch64CC::CondCode &OutCC) {
3496  bool DummyCanNegate;
3497  bool DummyMustBeFirst;
3498  if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3499    return SDValue();
3500
3501  return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3502}
3503
3504/// @}
3505
3506/// Returns how profitable it is to fold a comparison's operand's shift and/or
3507/// extension operations.
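/// For example, an operand such as (shl (and X, 0xFF), 2) scores 2, since both
/// the zero-extension and the small shift can fold into the compare operand
/// (roughly "cmp wN, wM, uxtb #2"), while a plain in-range shift scores 1.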
3508static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3509  auto isSupportedExtend = [&](SDValue V) {
3510    if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3511      return true;
3512
3513    if (V.getOpcode() == ISD::AND)
3514      if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3515        uint64_t Mask = MaskCst->getZExtValue();
3516        return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3517      }
3518
3519    return false;
3520  };
3521
3522  if (!Op.hasOneUse())
3523    return 0;
3524
3525  if (isSupportedExtend(Op))
3526    return 1;
3527
3528  unsigned Opc = Op.getOpcode();
3529  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3530    if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3531      uint64_t Shift = ShiftCst->getZExtValue();
3532      if (isSupportedExtend(Op.getOperand(0)))
3533        return (Shift <= 4) ? 2 : 1;
3534      EVT VT = Op.getValueType();
3535      if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3536        return 1;
3537    }
3538
3539  return 0;
3540}
3541
3542static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3543                             SDValue &AArch64cc, SelectionDAG &DAG,
3544                             const SDLoc &dl) {
3545  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3546    EVT VT = RHS.getValueType();
3547    uint64_t C = RHSC->getZExtValue();
3548    if (!isLegalArithImmed(C)) {
3549      // Constant does not fit, try adjusting it by one?
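      // For example, (x slt 4097) cannot encode 4097, but it is equivalent to
      // (x sle 4096), and 4096 (0x1000) is a legal immediate.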
3550      switch (CC) {
3551      default:
3552        break;
3553      case ISD::SETLT:
3554      case ISD::SETGE:
3555        if ((VT == MVT::i32 && C != 0x80000000 &&
3556             isLegalArithImmed((uint32_t)(C - 1))) ||
3557            (VT == MVT::i64 && C != 0x80000000ULL &&
3558             isLegalArithImmed(C - 1ULL))) {
3559          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3560          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3561          RHS = DAG.getConstant(C, dl, VT);
3562        }
3563        break;
3564      case ISD::SETULT:
3565      case ISD::SETUGE:
3566        if ((VT == MVT::i32 && C != 0 &&
3567             isLegalArithImmed((uint32_t)(C - 1))) ||
3568            (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
3569          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3570          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3571          RHS = DAG.getConstant(C, dl, VT);
3572        }
3573        break;
3574      case ISD::SETLE:
3575      case ISD::SETGT:
3576        if ((VT == MVT::i32 && C != INT32_MAX &&
3577             isLegalArithImmed((uint32_t)(C + 1))) ||
3578            (VT == MVT::i64 && C != INT64_MAX &&
3579             isLegalArithImmed(C + 1ULL))) {
3580          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3581          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3582          RHS = DAG.getConstant(C, dl, VT);
3583        }
3584        break;
3585      case ISD::SETULE:
3586      case ISD::SETUGT:
3587        if ((VT == MVT::i32 && C != UINT32_MAX &&
3588             isLegalArithImmed((uint32_t)(C + 1))) ||
3589            (VT == MVT::i64 && C != UINT64_MAX &&
3590             isLegalArithImmed(C + 1ULL))) {
3591          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3592          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3593          RHS = DAG.getConstant(C, dl, VT);
3594        }
3595        break;
3596      }
3597    }
3598  }
3599
3600  // Comparisons are canonicalized so that the RHS operand is simpler than the
3601  // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3602  // can fold some shift+extend operations on the RHS operand, so swap the
3603  // operands if that can be done.
3604  //
3605  // For example:
3606  //    lsl     w13, w11, #1
3607  //    cmp     w13, w12
3608  // can be turned into:
3609  //    cmp     w12, w11, lsl #1
3610  if (!isa<ConstantSDNode>(RHS) || !isLegalArithImmed(RHS->getAsZExtVal())) {
3611    SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
3612
3613    if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
3614      std::swap(LHS, RHS);
3615      CC = ISD::getSetCCSwappedOperands(CC);
3616    }
3617  }
3618
3619  SDValue Cmp;
3620  AArch64CC::CondCode AArch64CC;
3621  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
3622    const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
3623
3624    // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3625    // For the i8 operand, the largest immediate is 255, so this can be easily
3626    // encoded in the compare instruction. For the i16 operand, however, the
3627    // largest immediate cannot be encoded in the compare.
3628    // Therefore, use a sign extending load and cmn to avoid materializing the
3629    // -1 constant. For example,
3630    // movz w1, #65535
3631    // ldrh w0, [x0, #0]
3632    // cmp w0, w1
3633    // >
3634    // ldrsh w0, [x0, #0]
3635    // cmn w0, #1
    // Fundamentally, we're relying on the property that (zext LHS) ==
    // (zext RHS) if and only if (sext LHS) == (sext RHS). The checks are in
    // place to ensure both the LHS and RHS are truly zero extended and to
    // make sure the transformation is profitable.
3640    if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
3641        cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3642        cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
3643        LHS.getNode()->hasNUsesOfValue(1, 0)) {
3644      int16_t ValueofRHS = RHS->getAsZExtVal();
3645      if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
3646        SDValue SExt =
3647            DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
3648                        DAG.getValueType(MVT::i16));
3649        Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
3650                                                   RHS.getValueType()),
3651                             CC, dl, DAG);
3652        AArch64CC = changeIntCCToAArch64CC(CC);
3653      }
3654    }
3655
3656    if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3657      if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
3658        if ((CC == ISD::SETNE) ^ RHSC->isZero())
3659          AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
3660      }
3661    }
3662  }
3663
3664  if (!Cmp) {
3665    Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3666    AArch64CC = changeIntCCToAArch64CC(CC);
3667  }
3668  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
3669  return Cmp;
3670}
3671
3672static std::pair<SDValue, SDValue>
3673getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
3674  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3675         "Unsupported value type");
3676  SDValue Value, Overflow;
3677  SDLoc DL(Op);
3678  SDValue LHS = Op.getOperand(0);
3679  SDValue RHS = Op.getOperand(1);
3680  unsigned Opc = 0;
3681  switch (Op.getOpcode()) {
3682  default:
3683    llvm_unreachable("Unknown overflow instruction!");
3684  case ISD::SADDO:
3685    Opc = AArch64ISD::ADDS;
3686    CC = AArch64CC::VS;
3687    break;
3688  case ISD::UADDO:
3689    Opc = AArch64ISD::ADDS;
3690    CC = AArch64CC::HS;
3691    break;
3692  case ISD::SSUBO:
3693    Opc = AArch64ISD::SUBS;
3694    CC = AArch64CC::VS;
3695    break;
3696  case ISD::USUBO:
3697    Opc = AArch64ISD::SUBS;
3698    CC = AArch64CC::LO;
3699    break;
  // Multiply needs a little extra work.
3701  case ISD::SMULO:
3702  case ISD::UMULO: {
3703    CC = AArch64CC::NE;
3704    bool IsSigned = Op.getOpcode() == ISD::SMULO;
3705    if (Op.getValueType() == MVT::i32) {
3706      // Extend to 64-bits, then perform a 64-bit multiply.
3707      unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3708      LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3709      RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3710      SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3711      Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3712
3713      // Check that the result fits into a 32-bit integer.
3714      SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3715      if (IsSigned) {
3716        // cmp xreg, wreg, sxtw
3717        SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3718        Overflow =
3719            DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
3720      } else {
3721        // tst xreg, #0xffffffff00000000
3722        SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3723        Overflow =
3724            DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
3725      }
3726      break;
3727    }
3728    assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
    // For the 64-bit multiply:
3730    Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3731    if (IsSigned) {
3732      SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3733      SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3734                                      DAG.getConstant(63, DL, MVT::i64));
3735      // It is important that LowerBits is last, otherwise the arithmetic
3736      // shift will not be folded into the compare (SUBS).
3737      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3738      Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
3739                     .getValue(1);
3740    } else {
3741      SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3742      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3743      Overflow =
3744          DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3745                      DAG.getConstant(0, DL, MVT::i64),
3746                      UpperBits).getValue(1);
3747    }
3748    break;
3749  }
3750  } // switch (...)
3751
3752  if (Opc) {
3753    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3754
3755    // Emit the AArch64 operation with overflow check.
3756    Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
3757    Overflow = Value.getValue(1);
3758  }
3759  return std::make_pair(Value, Overflow);
3760}
3761
3762SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3763  if (useSVEForFixedLengthVectorVT(Op.getValueType(),
3764                                   !Subtarget->isNeonAvailable()))
3765    return LowerToScalableOp(Op, DAG);
3766
3767  SDValue Sel = Op.getOperand(0);
3768  SDValue Other = Op.getOperand(1);
3769  SDLoc dl(Sel);
3770
3771  // If the operand is an overflow checking operation, invert the condition
3772  // code and kill the Not operation. I.e., transform:
  // (xor overflow_op_bool, 1)
3774  //   -->
3775  // (csel 1, 0, invert(cc), overflow_op_bool)
3776  // ... which later gets transformed to just a cset instruction with an
3777  // inverted condition code, rather than a cset + eor sequence.
3778  if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
3779    // Only lower legal XALUO ops.
3780    if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
3781      return SDValue();
3782
3783    SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3784    SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3785    AArch64CC::CondCode CC;
3786    SDValue Value, Overflow;
3787    std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
3788    SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3789    return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
3790                       CCVal, Overflow);
3791  }
3792  // If neither operand is a SELECT_CC, give up.
3793  if (Sel.getOpcode() != ISD::SELECT_CC)
3794    std::swap(Sel, Other);
3795  if (Sel.getOpcode() != ISD::SELECT_CC)
3796    return Op;
3797
3798  // The folding we want to perform is:
3799  // (xor x, (select_cc a, b, cc, 0, -1) )
3800  //   -->
3801  // (csel x, (xor x, -1), cc ...)
3802  //
3803  // The latter will get matched to a CSINV instruction.
3804
3805  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
3806  SDValue LHS = Sel.getOperand(0);
3807  SDValue RHS = Sel.getOperand(1);
3808  SDValue TVal = Sel.getOperand(2);
3809  SDValue FVal = Sel.getOperand(3);
3810
3811  // FIXME: This could be generalized to non-integer comparisons.
3812  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3813    return Op;
3814
3815  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3816  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3817
3818  // The values aren't constants, this isn't the pattern we're looking for.
3819  if (!CFVal || !CTVal)
3820    return Op;
3821
3822  // We can commute the SELECT_CC by inverting the condition.  This
3823  // might be needed to make this fit into a CSINV pattern.
3824  if (CTVal->isAllOnes() && CFVal->isZero()) {
3825    std::swap(TVal, FVal);
3826    std::swap(CTVal, CFVal);
3827    CC = ISD::getSetCCInverse(CC, LHS.getValueType());
3828  }
3829
3830  // If the constants line up, perform the transform!
3831  if (CTVal->isZero() && CFVal->isAllOnes()) {
3832    SDValue CCVal;
3833    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
3834
3835    FVal = Other;
3836    TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
3837                       DAG.getConstant(-1ULL, dl, Other.getValueType()));
3838
3839    return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
3840                       CCVal, Cmp);
3841  }
3842
3843  return Op;
3844}
3845
3846// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
3847// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
3848// sets 'C' bit to 0.
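// E.g. (sketch) for a carry-in value c:
//   Invert == false:  SUBS(c, 1)  ->  C = (c != 0)
//   Invert == true :  SUBS(0, c)  ->  C = (c == 0)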
3849static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
3850  SDLoc DL(Value);
3851  EVT VT = Value.getValueType();
3852  SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
3853  SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
3854  SDValue Cmp =
3855      DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
3856  return Cmp.getValue(1);
3857}
3858
3859// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
3860// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
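// E.g. (sketch) this is expected to select to "cset w0, hs" (or "cset w0, lo"
// when Invert is true).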
3861static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
3862                                bool Invert) {
3863  assert(Glue.getResNo() == 1);
3864  SDLoc DL(Glue);
3865  SDValue Zero = DAG.getConstant(0, DL, VT);
3866  SDValue One = DAG.getConstant(1, DL, VT);
3867  unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
3868  SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
3869  return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
3870}
3871
3872// Value is 1 if 'V' bit of NZCV is 1, else 0
3873static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
3874  assert(Glue.getResNo() == 1);
3875  SDLoc DL(Glue);
3876  SDValue Zero = DAG.getConstant(0, DL, VT);
3877  SDValue One = DAG.getConstant(1, DL, VT);
3878  SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
3879  return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
3880}
3881
3882// This lowering is inefficient, but it will get cleaned up by
3883// `foldOverflowCheck`
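// E.g. (sketch) an i32 unsigned add-with-carry is expected to lower to roughly:
//   cmp  w2, #1        // carry-in -> C flag
//   adcs w0, w0, w1    // sum, carry-out in C
//   cset w1, hs        // carry-out -> i32 value
// with the redundant flag/value round-trips in chained operations removed
// later by foldOverflowCheck.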
3884static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
3885                                  unsigned Opcode, bool IsSigned) {
3886  EVT VT0 = Op.getValue(0).getValueType();
3887  EVT VT1 = Op.getValue(1).getValueType();
3888
3889  if (VT0 != MVT::i32 && VT0 != MVT::i64)
3890    return SDValue();
3891
3892  bool InvertCarry = Opcode == AArch64ISD::SBCS;
3893  SDValue OpLHS = Op.getOperand(0);
3894  SDValue OpRHS = Op.getOperand(1);
3895  SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
3896
3897  SDLoc DL(Op);
3898  SDVTList VTs = DAG.getVTList(VT0, VT1);
3899
3900  SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
3901                            OpRHS, OpCarryIn);
3902
3903  SDValue OutFlag =
3904      IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
3905               : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
3906
3907  return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
3908}
3909
3910static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
3911  // Let legalize expand this if it isn't a legal type yet.
3912  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3913    return SDValue();
3914
3915  SDLoc dl(Op);
3916  AArch64CC::CondCode CC;
3917  // The actual operation that sets the overflow or carry flag.
3918  SDValue Value, Overflow;
3919  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
3920
3921  // We use 0 and 1 as false and true values.
3922  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3923  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3924
3925  // We use an inverted condition, because the conditional select is inverted
3926  // too. This will allow it to be selected to a single instruction:
3927  // CSINC Wd, WZR, WZR, invert(cond).
3928  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3929  Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
3930                         CCVal, Overflow);
3931
3932  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3933  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
3934}
3935
3936// Prefetch operands are:
3937// 1: Address to prefetch
3938// 2: bool isWrite
3939// 3: int locality (0 = no locality ... 3 = extreme locality)
3940// 4: bool isDataCache
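// E.g. (sketch) a data write prefetch with maximal locality (isWrite = 1,
// locality = 3, isDataCache = 1) encodes to PrfOp = (1 << 4) | (0 << 3) |
// (0 << 1) | 0 = 16, which corresponds to PSTL1KEEP.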
3941static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
3942  SDLoc DL(Op);
3943  unsigned IsWrite = Op.getConstantOperandVal(2);
3944  unsigned Locality = Op.getConstantOperandVal(3);
3945  unsigned IsData = Op.getConstantOperandVal(4);
3946
3947  bool IsStream = !Locality;
3948  // When the locality number is set
3949  if (Locality) {
3950    // The front-end should have filtered out the out-of-range values
3951    assert(Locality <= 3 && "Prefetch locality out-of-range");
    // The locality hint is the inverse of the target cache level: higher
    // locality means a closer (faster) cache, and the encoding starts at 0
    // for level 1. Flip the value accordingly.
3955    Locality = 3 - Locality;
3956  }
3957
  // Build the mask value encoding the expected behavior.
3959  unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
3960                   (!IsData << 3) |     // IsDataCache bit
3961                   (Locality << 1) |    // Cache level bits
3962                   (unsigned)IsStream;  // Stream bit
3963  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
3964                     DAG.getTargetConstant(PrfOp, DL, MVT::i32),
3965                     Op.getOperand(1));
3966}
3967
3968SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
3969                                              SelectionDAG &DAG) const {
3970  EVT VT = Op.getValueType();
3971  if (VT.isScalableVector())
3972    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
3973
3974  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
3975    return LowerFixedLengthFPExtendToSVE(Op, DAG);
3976
3977  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
3978  return SDValue();
3979}
3980
3981SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
3982                                             SelectionDAG &DAG) const {
3983  if (Op.getValueType().isScalableVector())
3984    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
3985
3986  bool IsStrict = Op->isStrictFPOpcode();
3987  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3988  EVT SrcVT = SrcVal.getValueType();
3989
3990  if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
3991    return LowerFixedLengthFPRoundToSVE(Op, DAG);
3992
3993  if (SrcVT != MVT::f128) {
3994    // Expand cases where the input is a vector bigger than NEON.
3995    if (useSVEForFixedLengthVectorVT(SrcVT))
3996      return SDValue();
3997
3998    // It's legal except when f128 is involved
3999    return Op;
4000  }
4001
4002  return SDValue();
4003}
4004
4005SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4006                                                    SelectionDAG &DAG) const {
4007  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4008  // Any additional optimization in this function should be recorded
4009  // in the cost tables.
4010  bool IsStrict = Op->isStrictFPOpcode();
4011  EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4012  EVT VT = Op.getValueType();
4013
4014  if (VT.isScalableVector()) {
4015    unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4016                          ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4017                          : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4018    return LowerToPredicatedOp(Op, DAG, Opcode);
4019  }
4020
4021  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4022      useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4023    return LowerFixedLengthFPToIntToSVE(Op, DAG);
4024
4025  unsigned NumElts = InVT.getVectorNumElements();
4026
4027  // f16 conversions are promoted to f32 when full fp16 is not supported.
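  // E.g. (sketch) a v4f16 -> v4i32 fptoui without +fullfp16 becomes roughly
  // "fcvtl v0.4s, v0.4h" followed by "fcvtzu v0.4s, v0.4s".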
4028  if (InVT.getVectorElementType() == MVT::f16 &&
4029      !Subtarget->hasFullFP16()) {
4030    MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
4031    SDLoc dl(Op);
4032    if (IsStrict) {
4033      SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
4034                                {Op.getOperand(0), Op.getOperand(1)});
4035      return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4036                         {Ext.getValue(1), Ext.getValue(0)});
4037    }
4038    return DAG.getNode(
4039        Op.getOpcode(), dl, Op.getValueType(),
4040        DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
4041  }
4042
4043  uint64_t VTSize = VT.getFixedSizeInBits();
4044  uint64_t InVTSize = InVT.getFixedSizeInBits();
4045  if (VTSize < InVTSize) {
4046    SDLoc dl(Op);
4047    if (IsStrict) {
4048      InVT = InVT.changeVectorElementTypeToInteger();
4049      SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
4050                               {Op.getOperand(0), Op.getOperand(1)});
4051      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4052      return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
4053    }
4054    SDValue Cv =
4055        DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
4056                    Op.getOperand(0));
4057    return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4058  }
4059
4060  if (VTSize > InVTSize) {
4061    SDLoc dl(Op);
4062    MVT ExtVT =
4063        MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
4064                         VT.getVectorNumElements());
4065    if (IsStrict) {
4066      SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
4067                                {Op.getOperand(0), Op.getOperand(1)});
4068      return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4069                         {Ext.getValue(1), Ext.getValue(0)});
4070    }
4071    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
4072    return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
4073  }
4074
4075  // Use a scalar operation for conversions between single-element vectors of
4076  // the same size.
4077  if (NumElts == 1) {
4078    SDLoc dl(Op);
4079    SDValue Extract = DAG.getNode(
4080        ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4081        Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
4082    EVT ScalarVT = VT.getScalarType();
4083    if (IsStrict)
4084      return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4085                         {Op.getOperand(0), Extract});
4086    return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4087  }
4088
4089  // Type changing conversions are illegal.
4090  return Op;
4091}
4092
4093SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4094                                              SelectionDAG &DAG) const {
4095  bool IsStrict = Op->isStrictFPOpcode();
4096  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4097
4098  if (SrcVal.getValueType().isVector())
4099    return LowerVectorFP_TO_INT(Op, DAG);
4100
4101  // f16 conversions are promoted to f32 when full fp16 is not supported.
4102  if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4103    SDLoc dl(Op);
4104    if (IsStrict) {
4105      SDValue Ext =
4106          DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
4107                      {Op.getOperand(0), SrcVal});
4108      return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
4109                         {Ext.getValue(1), Ext.getValue(0)});
4110    }
4111    return DAG.getNode(
4112        Op.getOpcode(), dl, Op.getValueType(),
4113        DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
4114  }
4115
4116  if (SrcVal.getValueType() != MVT::f128) {
4117    // It's legal except when f128 is involved
4118    return Op;
4119  }
4120
4121  return SDValue();
4122}
4123
4124SDValue
4125AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4126                                                SelectionDAG &DAG) const {
4127  // AArch64 FP-to-int conversions saturate to the destination element size, so
4128  // we can lower common saturating conversions to simple instructions.
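  // E.g. (sketch) llvm.fptosi.sat.v4i32.v4f32 maps onto a single
  // "fcvtzs v0.4s, v0.4s", since the conversion already saturates to i32.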
4129  SDValue SrcVal = Op.getOperand(0);
4130  EVT SrcVT = SrcVal.getValueType();
4131  EVT DstVT = Op.getValueType();
4132  EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4133
4134  uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4135  uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4136  uint64_t SatWidth = SatVT.getScalarSizeInBits();
4137  assert(SatWidth <= DstElementWidth &&
4138         "Saturation width cannot exceed result width");
4139
4140  // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4141  // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4142  // types, so this is hard to reach.
4143  if (DstVT.isScalableVector())
4144    return SDValue();
4145
4146  EVT SrcElementVT = SrcVT.getVectorElementType();
4147
  // Promote f16 to f32 when full FP16 support is unavailable or the
  // destination elements are wider than 16 bits, and saturate the widened
  // result.
4149  if (SrcElementVT == MVT::f16 &&
4150      (!Subtarget->hasFullFP16() || DstElementWidth > 16)) {
4151    MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4152    SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
4153    SrcVT = F32VT;
4154    SrcElementVT = MVT::f32;
4155    SrcElementWidth = 32;
4156  } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4157             SrcElementVT != MVT::f16)
4158    return SDValue();
4159
4160  SDLoc DL(Op);
4161  // Cases that we can emit directly.
4162  if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
4163    return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4164                       DAG.getValueType(DstVT.getScalarType()));
4165
  // Otherwise emit a conversion that saturates to a wider bit width, then
  // saturate the result. This is only valid if the legal conversion is at
  // least as wide as the saturation width. For f64, as we don't have 64-bit
  // vector MIN/MAX, it can be simpler to scalarize (at least until sqxtn is
  // selected).
4170  if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4171    return SDValue();
4172
4173  EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4174  SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4175                                  DAG.getValueType(IntVT.getScalarType()));
4176  SDValue Sat;
4177  if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4178    SDValue MinC = DAG.getConstant(
4179        APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4180    SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4181    SDValue MaxC = DAG.getConstant(
4182        APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4183    Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4184  } else {
4185    SDValue MinC = DAG.getConstant(
4186        APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4187    Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4188  }
4189
4190  return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4191}
4192
4193SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4194                                                  SelectionDAG &DAG) const {
4195  // AArch64 FP-to-int conversions saturate to the destination register size, so
4196  // we can lower common saturating conversions to simple instructions.
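  // E.g. (sketch) llvm.fptosi.sat.i32.f64 becomes a single "fcvtzs w0, d0".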
4197  SDValue SrcVal = Op.getOperand(0);
4198  EVT SrcVT = SrcVal.getValueType();
4199
4200  if (SrcVT.isVector())
4201    return LowerVectorFP_TO_INT_SAT(Op, DAG);
4202
4203  EVT DstVT = Op.getValueType();
4204  EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4205  uint64_t SatWidth = SatVT.getScalarSizeInBits();
4206  uint64_t DstWidth = DstVT.getScalarSizeInBits();
4207  assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4208
4209  // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4210  if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) {
4211    SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4212    SrcVT = MVT::f32;
4213  } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16)
4214    return SDValue();
4215
4216  SDLoc DL(Op);
4217  // Cases that we can emit directly.
4218  if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4219       (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4220      DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4221    return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4222                       DAG.getValueType(DstVT));
4223
  // Otherwise emit a conversion that saturates to a wider bit width, then
  // saturate the result. This is only valid if the legal conversion is at
  // least as wide as the saturation width.
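  // E.g. (sketch) llvm.fptosi.sat.i8.f32 (with the result promoted to i32)
  // becomes roughly "fcvtzs w8, s0" followed by clamping against 127 and -128.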
4227  if (DstWidth < SatWidth)
4228    return SDValue();
4229
4230  SDValue NativeCvt =
4231      DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
4232  SDValue Sat;
4233  if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4234    SDValue MinC = DAG.getConstant(
4235        APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
4236    SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
4237    SDValue MaxC = DAG.getConstant(
4238        APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
4239    Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
4240  } else {
4241    SDValue MinC = DAG.getConstant(
4242        APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
4243    Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
4244  }
4245
4246  return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4247}
4248
4249SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
4250                                                    SelectionDAG &DAG) const {
4251  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4252  // Any additional optimization in this function should be recorded
4253  // in the cost tables.
4254  bool IsStrict = Op->isStrictFPOpcode();
4255  EVT VT = Op.getValueType();
4256  SDLoc dl(Op);
4257  SDValue In = Op.getOperand(IsStrict ? 1 : 0);
4258  EVT InVT = In.getValueType();
4259  unsigned Opc = Op.getOpcode();
4260  bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
4261
4262  if (VT.isScalableVector()) {
4263    if (InVT.getVectorElementType() == MVT::i1) {
      // There is no direct conversion from an SVE predicate; sign- or
      // zero-extend it to a wider integer vector first.
4265      unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4266      EVT CastVT = getPromotedVTForPredicate(InVT);
4267      In = DAG.getNode(CastOpc, dl, CastVT, In);
4268      return DAG.getNode(Opc, dl, VT, In);
4269    }
4270
4271    unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
4272                               : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
4273    return LowerToPredicatedOp(Op, DAG, Opcode);
4274  }
4275
4276  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4277      useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4278    return LowerFixedLengthIntToFPToSVE(Op, DAG);
4279
4280  uint64_t VTSize = VT.getFixedSizeInBits();
4281  uint64_t InVTSize = InVT.getFixedSizeInBits();
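  // If the result is narrower than the input (e.g. v2i64 -> v2f32, a sketch),
  // convert at the matching wider FP type first and then round the result down.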
4282  if (VTSize < InVTSize) {
4283    MVT CastVT =
4284        MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
4285                         InVT.getVectorNumElements());
4286    if (IsStrict) {
4287      In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
4288                       {Op.getOperand(0), In});
4289      return DAG.getNode(
4290          ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
4291          {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
4292    }
4293    In = DAG.getNode(Opc, dl, CastVT, In);
4294    return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
4295                       DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
4296  }
4297
4298  if (VTSize > InVTSize) {
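    // The result is wider than the input: extend the integer input to the
    // matching width first (e.g. v4i16 -> v4f32 goes via v4i32, a sketch).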
4299    unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4300    EVT CastVT = VT.changeVectorElementTypeToInteger();
4301    In = DAG.getNode(CastOpc, dl, CastVT, In);
4302    if (IsStrict)
4303      return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
4304    return DAG.getNode(Opc, dl, VT, In);
4305  }
4306
4307  // Use a scalar operation for conversions between single-element vectors of
4308  // the same size.
4309  if (VT.getVectorNumElements() == 1) {
4310    SDValue Extract = DAG.getNode(
4311        ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4312        In, DAG.getConstant(0, dl, MVT::i64));
4313    EVT ScalarVT = VT.getScalarType();
4314    if (IsStrict)
4315      return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4316                         {Op.getOperand(0), Extract});
4317    return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4318  }
4319
4320  return Op;
4321}
4322
4323SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
4324                                            SelectionDAG &DAG) const {
4325  if (Op.getValueType().isVector())
4326    return LowerVectorINT_TO_FP(Op, DAG);
4327
4328  bool IsStrict = Op->isStrictFPOpcode();
4329  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4330
4331  // f16 conversions are promoted to f32 when full fp16 is not supported.
4332  if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4333    SDLoc dl(Op);
4334    if (IsStrict) {
4335      SDValue Val = DAG.getNode(Op.getOpcode(), dl, {MVT::f32, MVT::Other},
4336                                {Op.getOperand(0), SrcVal});
4337      return DAG.getNode(
4338          ISD::STRICT_FP_ROUND, dl, {MVT::f16, MVT::Other},
4339          {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4340    }
4341    return DAG.getNode(
4342        ISD::FP_ROUND, dl, MVT::f16,
4343        DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
4344        DAG.getIntPtrConstant(0, dl));
4345  }
4346
4347  // i128 conversions are libcalls.
4348  if (SrcVal.getValueType() == MVT::i128)
4349    return SDValue();
4350
4351  // Other conversions are legal, unless it's to the completely software-based
4352  // fp128.
4353  if (Op.getValueType() != MVT::f128)
4354    return Op;
4355  return SDValue();
4356}
4357
4358SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
4359                                            SelectionDAG &DAG) const {
4360  // For iOS, we want to call an alternative entry point: __sincos_stret,
4361  // which returns the values in two S / D registers.
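  // E.g. (sketch) a float sin/cos pair becomes a single __sincos_stret call
  // with both results returned in registers rather than two separate libcalls.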
4362  SDLoc dl(Op);
4363  SDValue Arg = Op.getOperand(0);
4364  EVT ArgVT = Arg.getValueType();
4365  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
4366
4367  ArgListTy Args;
4368  ArgListEntry Entry;
4369
4370  Entry.Node = Arg;
4371  Entry.Ty = ArgTy;
4372  Entry.IsSExt = false;
4373  Entry.IsZExt = false;
4374  Args.push_back(Entry);
4375
4376  RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
4377                                        : RTLIB::SINCOS_STRET_F32;
4378  const char *LibcallName = getLibcallName(LC);
4379  SDValue Callee =
4380      DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
4381
4382  StructType *RetTy = StructType::get(ArgTy, ArgTy);
4383  TargetLowering::CallLoweringInfo CLI(DAG);
4384  CLI.setDebugLoc(dl)
4385      .setChain(DAG.getEntryNode())
4386      .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
4387
4388  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4389  return CallResult.first;
4390}
4391
4392static MVT getSVEContainerType(EVT ContentTy);
4393
4394SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
4395                                            SelectionDAG &DAG) const {
4396  EVT OpVT = Op.getValueType();
4397  EVT ArgVT = Op.getOperand(0).getValueType();
4398
4399  if (useSVEForFixedLengthVectorVT(OpVT))
4400    return LowerFixedLengthBitcastToSVE(Op, DAG);
4401
4402  if (OpVT.isScalableVector()) {
4403    // Bitcasting between unpacked vector types of different element counts is
4404    // not a NOP because the live elements are laid out differently.
4405    //                01234567
4406    // e.g. nxv2i32 = XX??XX??
4407    //      nxv4f16 = X?X?X?X?
4408    if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
4409      return SDValue();
4410
4411    if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
4412      assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
4413             "Expected int->fp bitcast!");
4414      SDValue ExtResult =
4415          DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
4416                      Op.getOperand(0));
4417      return getSVESafeBitCast(OpVT, ExtResult, DAG);
4418    }
4419    return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
4420  }
4421
4422  if (OpVT != MVT::f16 && OpVT != MVT::bf16)
4423    return SDValue();
4424
4425  // Bitcasts between f16 and bf16 are legal.
4426  if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
4427    return Op;
4428
4429  assert(ArgVT == MVT::i16);
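  // This is an i16 -> f16/bf16 bitcast: widen to i32, move to an FPR via an
  // f32 bitcast (an "fmov s0, w0"-style move), then extract the h subregister.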
4430  SDLoc DL(Op);
4431
4432  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
4433  Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
4434  return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
4435}
4436
4437static EVT getExtensionTo64Bits(const EVT &OrigVT) {
4438  if (OrigVT.getSizeInBits() >= 64)
4439    return OrigVT;
4440
4441  assert(OrigVT.isSimple() && "Expecting a simple value type");
4442
4443  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
4444  switch (OrigSimpleTy) {
4445  default: llvm_unreachable("Unexpected Vector Type");
4446  case MVT::v2i8:
4447  case MVT::v2i16:
    return MVT::v2i32;
  case MVT::v4i8:
    return MVT::v4i16;
4451  }
4452}
4453
4454static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
4455                                                 const EVT &OrigTy,
4456                                                 const EVT &ExtTy,
4457                                                 unsigned ExtOpcode) {
4458  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
4459  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
4460  // 64-bits we need to insert a new extension so that it will be 64-bits.
4461  assert(ExtTy.is128BitVector() && "Unexpected extension size");
4462  if (OrigTy.getSizeInBits() >= 64)
4463    return N;
4464
4465  // Must extend size to at least 64 bits to be used as an operand for VMULL.
4466  EVT NewVT = getExtensionTo64Bits(OrigTy);
4467
4468  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
4469}
4470
4471// Returns lane if Op extracts from a two-element vector and lane is constant
4472// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
4473static std::optional<uint64_t>
4474getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
4475  SDNode *OpNode = Op.getNode();
4476  if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
4477    return std::nullopt;
4478
4479  EVT VT = OpNode->getOperand(0).getValueType();
4480  ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
4481  if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
4482    return std::nullopt;
4483
4484  return C->getZExtValue();
4485}
4486
4487static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
4488                                   bool isSigned) {
4489  EVT VT = N.getValueType();
4490
4491  if (N.getOpcode() != ISD::BUILD_VECTOR)
4492    return false;
4493
4494  for (const SDValue &Elt : N->op_values()) {
4495    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
4496      unsigned EltSize = VT.getScalarSizeInBits();
4497      unsigned HalfSize = EltSize / 2;
4498      if (isSigned) {
4499        if (!isIntN(HalfSize, C->getSExtValue()))
4500          return false;
4501      } else {
4502        if (!isUIntN(HalfSize, C->getZExtValue()))
4503          return false;
4504      }
4505      continue;
4506    }
4507    return false;
4508  }
4509
4510  return true;
4511}
4512
4513static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
4514  EVT VT = N.getValueType();
4515  assert(VT.is128BitVector() && "Unexpected vector MULL size");
4516
4517  unsigned NumElts = VT.getVectorNumElements();
4518  unsigned OrigEltSize = VT.getScalarSizeInBits();
4519  unsigned EltSize = OrigEltSize / 2;
4520  MVT TruncVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
4521
4522  APInt HiBits = APInt::getHighBitsSet(OrigEltSize, EltSize);
4523  if (DAG.MaskedValueIsZero(N, HiBits))
4524    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, N);
4525
4526  if (ISD::isExtOpcode(N.getOpcode()))
4527    return addRequiredExtensionForVectorMULL(N.getOperand(0), DAG,
4528                                             N.getOperand(0).getValueType(), VT,
4529                                             N.getOpcode());
4530
4531  assert(N.getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
4532  SDLoc dl(N);
4533  SmallVector<SDValue, 8> Ops;
4534  for (unsigned i = 0; i != NumElts; ++i) {
4535    const APInt &CInt = N.getConstantOperandAPInt(i);
4536    // Element types smaller than 32 bits are not legal, so use i32 elements.
4537    // The values are implicitly truncated so sext vs. zext doesn't matter.
4538    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
4539  }
4540  return DAG.getBuildVector(TruncVT, dl, Ops);
4541}
4542
4543static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
4544  return N.getOpcode() == ISD::SIGN_EXTEND ||
4545         N.getOpcode() == ISD::ANY_EXTEND ||
4546         isExtendedBUILD_VECTOR(N, DAG, true);
4547}
4548
4549static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
4550  return N.getOpcode() == ISD::ZERO_EXTEND ||
4551         N.getOpcode() == ISD::ANY_EXTEND ||
4552         isExtendedBUILD_VECTOR(N, DAG, false);
4553}
4554
4555static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
4556  unsigned Opcode = N.getOpcode();
4557  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4558    SDValue N0 = N.getOperand(0);
4559    SDValue N1 = N.getOperand(1);
4560    return N0->hasOneUse() && N1->hasOneUse() &&
4561      isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
4562  }
4563  return false;
4564}
4565
4566static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
4567  unsigned Opcode = N.getOpcode();
4568  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4569    SDValue N0 = N.getOperand(0);
4570    SDValue N1 = N.getOperand(1);
4571    return N0->hasOneUse() && N1->hasOneUse() &&
4572      isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
4573  }
4574  return false;
4575}
4576
4577SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
4578                                                 SelectionDAG &DAG) const {
  // The rounding mode is in bits 23:22 of the FPCR.
  // The AArch64 rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2,
  // 2->3, 3->0. The formula we use to implement this is
  // ((FPCR + (1 << 22)) >> 22) & 3, so that the shift + and get folded into a
  // bitfield extract.
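  // E.g. (sketch) with FPCR.RMode == 0b01 (round toward plus infinity) this
  // computes ((1 + 1) & 3) = 2, the FLT_ROUNDS value for upward rounding.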
4583  SDLoc dl(Op);
4584
4585  SDValue Chain = Op.getOperand(0);
4586  SDValue FPCR_64 = DAG.getNode(
4587      ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
4588      {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
4589  Chain = FPCR_64.getValue(1);
4590  SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
4591  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
4592                                  DAG.getConstant(1U << 22, dl, MVT::i32));
4593  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
4594                              DAG.getConstant(22, dl, MVT::i32));
4595  SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
4596                            DAG.getConstant(3, dl, MVT::i32));
4597  return DAG.getMergeValues({AND, Chain}, dl);
4598}
4599
4600SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
4601                                                 SelectionDAG &DAG) const {
4602  SDLoc DL(Op);
4603  SDValue Chain = Op->getOperand(0);
4604  SDValue RMValue = Op->getOperand(1);
4605
4606  // The rounding mode is in bits 23:22 of the FPCR.
4607  // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
4608  // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
  // (((arg - 1) & 3) << 22).
4610  //
  // The argument of llvm.set.rounding must be within the range [0, 3], so
  // NearestTiesToAway (4) is not handled here. It is the responsibility of
  // the code that generates llvm.set.rounding to ensure this condition.
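  // E.g. (sketch) llvm.set.rounding(2) (round upward) computes ((2 - 1) & 3)
  // = 1 and writes 0b01, the "round toward plus infinity" mode, into
  // FPCR[23:22].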
4614
4615  // Calculate new value of FPCR[23:22].
4616  RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
4617                        DAG.getConstant(1, DL, MVT::i32));
4618  RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
4619                        DAG.getConstant(0x3, DL, MVT::i32));
4620  RMValue =
4621      DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
4622                  DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
4623  RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
4624
4625  // Get current value of FPCR.
4626  SDValue Ops[] = {
4627      Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4628  SDValue FPCR =
4629      DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4630  Chain = FPCR.getValue(1);
4631  FPCR = FPCR.getValue(0);
4632
  // Put the new rounding mode into FPCR[23:22].
4634  const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
4635  FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
4636                     DAG.getConstant(RMMask, DL, MVT::i64));
4637  FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
4638  SDValue Ops2[] = {
4639      Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
4640      FPCR};
4641  return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
4642}
4643
4644static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
4645                                 SDLoc DL, bool &IsMLA) {
4646  bool IsN0SExt = isSignExtended(N0, DAG);
4647  bool IsN1SExt = isSignExtended(N1, DAG);
4648  if (IsN0SExt && IsN1SExt)
4649    return AArch64ISD::SMULL;
4650
4651  bool IsN0ZExt = isZeroExtended(N0, DAG);
4652  bool IsN1ZExt = isZeroExtended(N1, DAG);
4653
4654  if (IsN0ZExt && IsN1ZExt)
4655    return AArch64ISD::UMULL;
4656
4657  // Select SMULL if we can replace zext with sext.
4658  if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) &&
4659      !isExtendedBUILD_VECTOR(N0, DAG, false) &&
4660      !isExtendedBUILD_VECTOR(N1, DAG, false)) {
4661    SDValue ZextOperand;
4662    if (IsN0ZExt)
4663      ZextOperand = N0.getOperand(0);
4664    else
4665      ZextOperand = N1.getOperand(0);
4666    if (DAG.SignBitIsZero(ZextOperand)) {
4667      SDValue NewSext =
4668          DAG.getSExtOrTrunc(ZextOperand, DL, N0.getValueType());
4669      if (IsN0ZExt)
4670        N0 = NewSext;
4671      else
4672        N1 = NewSext;
4673      return AArch64ISD::SMULL;
4674    }
4675  }
4676
4677  // Select UMULL if we can replace the other operand with an extend.
4678  if (IsN0ZExt || IsN1ZExt) {
4679    EVT VT = N0.getValueType();
4680    APInt Mask = APInt::getHighBitsSet(VT.getScalarSizeInBits(),
4681                                       VT.getScalarSizeInBits() / 2);
4682    if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
4683      return AArch64ISD::UMULL;
4684  }
4685
4686  if (!IsN1SExt && !IsN1ZExt)
4687    return 0;
4688
4689  // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
4690  // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
4691  if (IsN1SExt && isAddSubSExt(N0, DAG)) {
4692    IsMLA = true;
4693    return AArch64ISD::SMULL;
4694  }
4695  if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
4696    IsMLA = true;
4697    return AArch64ISD::UMULL;
4698  }
4699  if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
4700    std::swap(N0, N1);
4701    IsMLA = true;
4702    return AArch64ISD::UMULL;
4703  }
4704  return 0;
4705}
4706
4707SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
4708  EVT VT = Op.getValueType();
4709
4710  bool OverrideNEON = !Subtarget->isNeonAvailable();
4711  if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
4712    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
4713
4714  // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
4715  // that VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
4716  assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
4717         "unexpected type for custom-lowering ISD::MUL");
4718  SDValue N0 = Op.getOperand(0);
4719  SDValue N1 = Op.getOperand(1);
4720  bool isMLA = false;
4721  EVT OVT = VT;
4722  if (VT.is64BitVector()) {
4723    if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4724        isNullConstant(N0.getOperand(1)) &&
4725        N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4726        isNullConstant(N1.getOperand(1))) {
4727      N0 = N0.getOperand(0);
4728      N1 = N1.getOperand(0);
4729      VT = N0.getValueType();
4730    } else {
4731      if (VT == MVT::v1i64) {
4732        if (Subtarget->hasSVE())
4733          return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
4734        // Fall through to expand this.  It is not legal.
4735        return SDValue();
4736      } else
4737        // Other vector multiplications are legal.
4738        return Op;
4739    }
4740  }
4741
4742  SDLoc DL(Op);
4743  unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
4744
4745  if (!NewOpc) {
4746    if (VT.getVectorElementType() == MVT::i64) {
4747      // If SVE is available then i64 vector multiplications can also be made
4748      // legal.
4749      if (Subtarget->hasSVE())
4750        return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
4751      // Fall through to expand this.  It is not legal.
4752      return SDValue();
4753    } else
4754      // Other vector multiplications are legal.
4755      return Op;
4756  }
4757
4758  // Legalize to a S/UMULL instruction
4759  SDValue Op0;
4760  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
4761  if (!isMLA) {
4762    Op0 = skipExtensionForVectorMULL(N0, DAG);
4763    assert(Op0.getValueType().is64BitVector() &&
4764           Op1.getValueType().is64BitVector() &&
4765           "unexpected types for extended operands to VMULL");
4766    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
4767                       DAG.getNode(NewOpc, DL, VT, Op0, Op1),
4768                       DAG.getConstant(0, DL, MVT::i64));
4769  }
  // Optimize (zext A + zext B) * C into (S/UMULL A, C) + (S/UMULL B, C) during
  // isel lowering, to take advantage of no-stall back-to-back s/umul + s/umla.
  // This is beneficial for CPUs with accumulate forwarding such as
  // Cortex-A53/A57.
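  // E.g. (sketch) for zero-extended v8i8 operands this is expected to produce
  // an add of two UMULL nodes, which isel can then select as a umull followed
  // by a umlal.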
4773  SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
4774  SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
4775  EVT Op1VT = Op1.getValueType();
4776  return DAG.getNode(
4777      ISD::EXTRACT_SUBVECTOR, DL, OVT,
4778      DAG.getNode(N0.getOpcode(), DL, VT,
4779                  DAG.getNode(NewOpc, DL, VT,
4780                              DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
4781                  DAG.getNode(NewOpc, DL, VT,
4782                              DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
4783      DAG.getConstant(0, DL, MVT::i64));
4784}
4785
4786static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
4787                               int Pattern) {
4788  if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
4789    return DAG.getConstant(1, DL, MVT::nxv1i1);
4790  return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
4791                     DAG.getTargetConstant(Pattern, DL, MVT::i32));
4792}
4793
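// Try to lower an SVE while-comparison intrinsic with constant operands to a
// PTRUE with a fixed pattern when the number of active elements is known and
// fits within the minimum configured SVE vector length. E.g. (sketch)
// whilelo(0, 4) producing an nxv4i1 becomes "ptrue p0.s, vl4".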
4794static SDValue optimizeWhile(SDValue Op, SelectionDAG &DAG, bool IsSigned,
4795                             bool IsLess, bool IsEqual) {
4796  if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
4797      !isa<ConstantSDNode>(Op.getOperand(2)))
4798    return SDValue();
4799
4800  SDLoc dl(Op);
4801  APInt X = Op.getConstantOperandAPInt(1);
4802  APInt Y = Op.getConstantOperandAPInt(2);
4803  APInt NumActiveElems;
4804  bool Overflow;
4805  if (IsLess)
4806    NumActiveElems = IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
4807  else
4808    NumActiveElems = IsSigned ? X.ssub_ov(Y, Overflow) : X.usub_ov(Y, Overflow);
4809
4810  if (Overflow)
4811    return SDValue();
4812
4813  if (IsEqual) {
4814    APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
4815    NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
4816                              : NumActiveElems.uadd_ov(One, Overflow);
4817    if (Overflow)
4818      return SDValue();
4819  }
4820
4821  std::optional<unsigned> PredPattern =
4822      getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
4823  unsigned MinSVEVectorSize = std::max(
4824      DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u);
4825  unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
4826  if (PredPattern != std::nullopt &&
4827      NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
4828    return getPTrue(DAG, dl, Op.getValueType(), *PredPattern);
4829
4830  return SDValue();
4831}
4832
4833// Returns a safe bitcast between two scalable vector predicates, where
4834// any newly created lanes from a widening bitcast are defined as zero.
4835static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
4836  SDLoc DL(Op);
4837  EVT InVT = Op.getValueType();
4838
4839  assert(InVT.getVectorElementType() == MVT::i1 &&
4840         VT.getVectorElementType() == MVT::i1 &&
4841         "Expected a predicate-to-predicate bitcast");
4842  assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
4843         InVT.isScalableVector() &&
4844         DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
4845         "Only expect to cast between legal scalable predicate types!");
4846
4847  // Return the operand if the cast isn't changing type,
4848  // e.g. <n x 16 x i1> -> <n x 16 x i1>
4849  if (InVT == VT)
4850    return Op;
4851
4852  SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
4853
4854  // We only have to zero the lanes if new lanes are being defined, e.g. when
4855  // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
4856  // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
4857  // we can return here.
4858  if (InVT.bitsGT(VT))
4859    return Reinterpret;
4860
4861  // Check if the other lanes are already known to be zeroed by
4862  // construction.
4863  if (isZeroingInactiveLanes(Op))
4864    return Reinterpret;
4865
4866  // Zero the newly introduced lanes.
4867  SDValue Mask = DAG.getConstant(1, DL, InVT);
4868  Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
4869  return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
4870}
4871
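// Materialise the current value of PSTATE.SM by calling the SME support
// routine __arm_sme_state and masking bit 0 of its first result (the bit this
// lowering treats as PSTATE.SM).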
4872SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
4873                                                  SDValue Chain, SDLoc DL,
4874                                                  EVT VT) const {
4875  SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
4876                                         getPointerTy(DAG.getDataLayout()));
4877  Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
4878  Type *RetTy = StructType::get(Int64Ty, Int64Ty);
4879  TargetLowering::CallLoweringInfo CLI(DAG);
4880  ArgListTy Args;
4881  CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
4882      CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
4883      RetTy, Callee, std::move(Args));
4884  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4885  SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
4886  return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
4887                     Mask);
4888}
4889
4890// Lower an SME LDR/STR ZA intrinsic
4891// Case 1: If the vector number (vecnum) is an immediate in range, it gets
4892// folded into the instruction
4893//    ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
4894// Case 2: If the vecnum is not an immediate, then it is used to modify the base
4895// and tile slice registers
4896//    ldr(%tileslice, %ptr, %vecnum)
4897//    ->
4898//    %svl = rdsvl
4899//    %ptr2 = %ptr + %svl * %vecnum
4900//    %tileslice2 = %tileslice + %vecnum
4901//    ldr [%tileslice2, 0], [%ptr2, 0]
4902// Case 3: If the vecnum is an immediate out of range, then the same is done as
// case 2, but the base and slice registers are modified by the largest
// multiple of 16 not exceeding the vecnum, and the remainder is folded into
// the instruction. This means that successive loads and stores that are offset
// from each other can share the same base and slice register updates.
//    ldr(%tileslice, %ptr, 22)
//    ldr(%tileslice, %ptr, 23)
//    ->
//    %svl = rdsvl
//    %ptr2 = %ptr + %svl * 16
//    %tileslice2 = %tileslice + 16
//    ldr [%tileslice2, 6], [%ptr2, 6]
//    ldr [%tileslice2, 7], [%ptr2, 7]
4915// Case 4: If the vecnum is an add of an immediate, then the non-immediate
4916// operand and the immediate can be folded into the instruction, like case 2.
4917//    ldr(%tileslice, %ptr, %vecnum + 7)
4918//    ldr(%tileslice, %ptr, %vecnum + 8)
4919//    ->
4920//    %svl = rdsvl
4921//    %ptr2 = %ptr + %svl * %vecnum
4922//    %tileslice2 = %tileslice + %vecnum
4923//    ldr [%tileslice2, 7], [%ptr2, 7]
4924//    ldr [%tileslice2, 8], [%ptr2, 8]
4925// Case 5: The vecnum being an add of an immediate out of range is also handled,
4926// in which case the same remainder logic as case 3 is used.
4927SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
4928  SDLoc DL(N);
4929
4930  SDValue TileSlice = N->getOperand(2);
4931  SDValue Base = N->getOperand(3);
4932  SDValue VecNum = N->getOperand(4);
4933  int32_t ConstAddend = 0;
4934  SDValue VarAddend = VecNum;
4935
4936  // If the vnum is an add of an immediate, we can fold it into the instruction
4937  if (VecNum.getOpcode() == ISD::ADD &&
4938      isa<ConstantSDNode>(VecNum.getOperand(1))) {
4939    ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
4940    VarAddend = VecNum.getOperand(0);
4941  } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
4942    ConstAddend = ImmNode->getSExtValue();
4943    VarAddend = SDValue();
4944  }
4945
4946  int32_t ImmAddend = ConstAddend % 16;
4947  if (int32_t C = (ConstAddend - ImmAddend)) {
4948    SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
4949    VarAddend = VarAddend
4950                    ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
4951                    : CVal;
4952  }
4953
4954  if (VarAddend) {
4955    // Get the vector length that will be multiplied by vnum
4956    auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
4957                           DAG.getConstant(1, DL, MVT::i32));
4958
4959    // Multiply SVL and vnum then add it to the base
4960    SDValue Mul = DAG.getNode(
4961        ISD::MUL, DL, MVT::i64,
4962        {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
4963    Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
4964    // Just add vnum to the tileslice
4965    TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
4966  }
4967
4968  return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
4969                     DL, MVT::Other,
4970                     {/*Chain=*/N.getOperand(0), TileSlice, Base,
4971                      DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
4972}
4973
4974SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
4975                                                   SelectionDAG &DAG) const {
4976  unsigned IntNo = Op.getConstantOperandVal(1);
4977  SDLoc DL(Op);
4978  switch (IntNo) {
4979  default:
4980    return SDValue(); // Don't custom lower most intrinsics.
4981  case Intrinsic::aarch64_prefetch: {
4982    SDValue Chain = Op.getOperand(0);
4983    SDValue Addr = Op.getOperand(2);
4984
4985    unsigned IsWrite = Op.getConstantOperandVal(3);
4986    unsigned Locality = Op.getConstantOperandVal(4);
4987    unsigned IsStream = Op.getConstantOperandVal(5);
4988    unsigned IsData = Op.getConstantOperandVal(6);
4989    unsigned PrfOp = (IsWrite << 4) |    // Load/Store bit
4990                     (!IsData << 3) |    // IsDataCache bit
4991                     (Locality << 1) |   // Cache level bits
4992                     (unsigned)IsStream; // Stream bit
4993
4994    return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
4995                       DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
4996  }
4997  case Intrinsic::aarch64_sme_str:
4998  case Intrinsic::aarch64_sme_ldr: {
4999    return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
5000  }
5001  case Intrinsic::aarch64_sme_za_enable:
5002    return DAG.getNode(
5003        AArch64ISD::SMSTART, DL, MVT::Other,
5004        Op->getOperand(0), // Chain
5005        DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5006        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
5007  case Intrinsic::aarch64_sme_za_disable:
5008    return DAG.getNode(
5009        AArch64ISD::SMSTOP, DL, MVT::Other,
5010        Op->getOperand(0), // Chain
5011        DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5012        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
5013  }
5014}
5015
5016SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5017                                                      SelectionDAG &DAG) const {
5018  unsigned IntNo = Op.getConstantOperandVal(1);
5019  SDLoc DL(Op);
5020  switch (IntNo) {
5021  default:
5022    return SDValue(); // Don't custom lower most intrinsics.
5023  case Intrinsic::aarch64_mops_memset_tag: {
5024    auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
5025    SDValue Chain = Node->getChain();
5026    SDValue Dst = Op.getOperand(2);
5027    SDValue Val = Op.getOperand(3);
5028    Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
5029    SDValue Size = Op.getOperand(4);
5030    auto Alignment = Node->getMemOperand()->getAlign();
5031    bool IsVol = Node->isVolatile();
5032    auto DstPtrInfo = Node->getPointerInfo();
5033
5034    const auto &SDI =
5035        static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
5036    SDValue MS =
5037        SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val,
5038                     Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{});
5039
5040    // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
5041    // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
5042    // LowerOperationWrapper will complain that the number of results has
5043    // changed.
5044    return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
5045  }
5046  }
5047}
5048
5049SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5050                                                     SelectionDAG &DAG) const {
5051  unsigned IntNo = Op.getConstantOperandVal(0);
5052  SDLoc dl(Op);
5053  switch (IntNo) {
5054  default: return SDValue();    // Don't custom lower most intrinsics.
5055  case Intrinsic::thread_pointer: {
5056    EVT PtrVT = getPointerTy(DAG.getDataLayout());
5057    return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
5058  }
5059  case Intrinsic::aarch64_neon_abs: {
5060    EVT Ty = Op.getValueType();
5061    if (Ty == MVT::i64) {
5062      SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
5063                                   Op.getOperand(1));
5064      Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
5065      return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
5066    } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
5067      return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
5068    } else {
      report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
5070    }
5071  }
5072  case Intrinsic::aarch64_neon_pmull64: {
5073    SDValue LHS = Op.getOperand(1);
5074    SDValue RHS = Op.getOperand(2);
5075
5076    std::optional<uint64_t> LHSLane =
5077        getConstantLaneNumOfExtractHalfOperand(LHS);
5078    std::optional<uint64_t> RHSLane =
5079        getConstantLaneNumOfExtractHalfOperand(RHS);
5080
5081    assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
5082    assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
5083
    // 'aarch64_neon_pmull64' takes i64 parameters, while the pmull/pmull2
    // instructions execute on SIMD registers, so canonicalize i64 to v1i64,
    // which ISel recognizes better. For example, this generates a ldr into d*
    // registers rather than a GPR load followed by a fmov.
5088    auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
5089                                  std::optional<uint64_t> OtherLane,
5090                                  const SDLoc &dl,
5091                                  SelectionDAG &DAG) -> SDValue {
      // If the operand is a higher half itself, rewrite it to
      // extract_high_v2i64; this way aarch64_neon_pmull64 can
      // reuse the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
5095      if (NLane && *NLane == 1)
5096        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5097                           N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
5098
5099      // Operand N is not a higher half but the other operand is.
5100      if (OtherLane && *OtherLane == 1) {
5101        // If this operand is a lower half, rewrite it to
5102        // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
5103        // align lanes of two operands. A roundtrip sequence (to move from lane
5104        // 1 to lane 0) is like this:
5105        //   mov x8, v0.d[1]
5106        //   fmov d0, x8
5107        if (NLane && *NLane == 0)
5108          return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5109                             DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
5110                                         N.getOperand(0),
5111                                         DAG.getConstant(0, dl, MVT::i64)),
5112                             DAG.getConstant(1, dl, MVT::i64));
5113
5114        // Otherwise just dup from main to all lanes.
5115        return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
5116      }
5117
5118      // Neither operand is an extract of higher half, so codegen may just use
5119      // the non-high version of PMULL instruction. Use v1i64 to represent i64.
5120      assert(N.getValueType() == MVT::i64 &&
5121             "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
5122      return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
5123    };
5124
5125    LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
5126    RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
5127
5128    return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
5129  }
5130  case Intrinsic::aarch64_neon_smax:
5131    return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
5132                       Op.getOperand(1), Op.getOperand(2));
5133  case Intrinsic::aarch64_neon_umax:
5134    return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
5135                       Op.getOperand(1), Op.getOperand(2));
5136  case Intrinsic::aarch64_neon_smin:
5137    return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
5138                       Op.getOperand(1), Op.getOperand(2));
5139  case Intrinsic::aarch64_neon_umin:
5140    return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
5141                       Op.getOperand(1), Op.getOperand(2));
5142  case Intrinsic::aarch64_neon_scalar_sqxtn:
5143  case Intrinsic::aarch64_neon_scalar_sqxtun:
5144  case Intrinsic::aarch64_neon_scalar_uqxtn: {
5145    assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
5146    if (Op.getValueType() == MVT::i32)
5147      return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
5148                         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
5149                                     Op.getOperand(0),
5150                                     DAG.getNode(ISD::BITCAST, dl, MVT::f64,
5151                                                 Op.getOperand(1))));
5152    return SDValue();
5153  }
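  // The SVE while-compare intrinsics all map onto optimizeWhile; the flags
  // encode signedness, comparison direction (less/greater) and whether
  // equality is included (e.g. whilele vs whilelt).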
5154  case Intrinsic::aarch64_sve_whilelo:
5155    return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/true,
5156                         /*IsEqual=*/false);
5157  case Intrinsic::aarch64_sve_whilelt:
5158    return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/true,
5159                         /*IsEqual=*/false);
5160  case Intrinsic::aarch64_sve_whilels:
5161    return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/true,
5162                         /*IsEqual=*/true);
5163  case Intrinsic::aarch64_sve_whilele:
5164    return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/true,
5165                         /*IsEqual=*/true);
5166  case Intrinsic::aarch64_sve_whilege:
5167    return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/false,
5168                         /*IsEqual=*/true);
5169  case Intrinsic::aarch64_sve_whilegt:
5170    return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/false,
5171                         /*IsEqual=*/false);
5172  case Intrinsic::aarch64_sve_whilehs:
5173    return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/false,
5174                         /*IsEqual=*/true);
5175  case Intrinsic::aarch64_sve_whilehi:
5176    return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/false,
5177                         /*IsEqual=*/false);
5178  case Intrinsic::aarch64_sve_sunpkhi:
5179    return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
5180                       Op.getOperand(1));
5181  case Intrinsic::aarch64_sve_sunpklo:
5182    return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
5183                       Op.getOperand(1));
5184  case Intrinsic::aarch64_sve_uunpkhi:
5185    return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
5186                       Op.getOperand(1));
5187  case Intrinsic::aarch64_sve_uunpklo:
5188    return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
5189                       Op.getOperand(1));
5190  case Intrinsic::aarch64_sve_clasta_n:
5191    return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
5192                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5193  case Intrinsic::aarch64_sve_clastb_n:
5194    return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
5195                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5196  case Intrinsic::aarch64_sve_lasta:
5197    return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
5198                       Op.getOperand(1), Op.getOperand(2));
5199  case Intrinsic::aarch64_sve_lastb:
5200    return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
5201                       Op.getOperand(1), Op.getOperand(2));
5202  case Intrinsic::aarch64_sve_rev:
5203    return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
5204                       Op.getOperand(1));
5205  case Intrinsic::aarch64_sve_tbl:
5206    return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
5207                       Op.getOperand(1), Op.getOperand(2));
5208  case Intrinsic::aarch64_sve_trn1:
5209    return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
5210                       Op.getOperand(1), Op.getOperand(2));
5211  case Intrinsic::aarch64_sve_trn2:
5212    return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
5213                       Op.getOperand(1), Op.getOperand(2));
5214  case Intrinsic::aarch64_sve_uzp1:
5215    return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
5216                       Op.getOperand(1), Op.getOperand(2));
5217  case Intrinsic::aarch64_sve_uzp2:
5218    return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
5219                       Op.getOperand(1), Op.getOperand(2));
5220  case Intrinsic::aarch64_sve_zip1:
5221    return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
5222                       Op.getOperand(1), Op.getOperand(2));
5223  case Intrinsic::aarch64_sve_zip2:
5224    return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
5225                       Op.getOperand(1), Op.getOperand(2));
5226  case Intrinsic::aarch64_sve_splice:
5227    return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
5228                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5229  case Intrinsic::aarch64_sve_ptrue:
5230    return getPTrue(DAG, dl, Op.getValueType(), Op.getConstantOperandVal(1));
5231  case Intrinsic::aarch64_sve_clz:
5232    return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
5233                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
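  // The SME counting intrinsics are all lowered via RDSVL #1, which returns
  // the streaming vector length in bytes; shift right to convert bytes into
  // halfwords, words or doublewords as required.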
5234  case Intrinsic::aarch64_sme_cntsb:
5235    return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5236                       DAG.getConstant(1, dl, MVT::i32));
5237  case Intrinsic::aarch64_sme_cntsh: {
5238    SDValue One = DAG.getConstant(1, dl, MVT::i32);
5239    SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
5240    return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
5241  }
5242  case Intrinsic::aarch64_sme_cntsw: {
5243    SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5244                                DAG.getConstant(1, dl, MVT::i32));
5245    return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5246                       DAG.getConstant(2, dl, MVT::i32));
5247  }
5248  case Intrinsic::aarch64_sme_cntsd: {
5249    SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5250                                DAG.getConstant(1, dl, MVT::i32));
5251    return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5252                       DAG.getConstant(3, dl, MVT::i32));
5253  }
5254  case Intrinsic::aarch64_sve_cnt: {
5255    SDValue Data = Op.getOperand(3);
5256    // CTPOP only supports integer operands.
5257    if (Data.getValueType().isFloatingPoint())
5258      Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
5259    return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
5260                       Op.getOperand(2), Data, Op.getOperand(1));
5261  }
5262  case Intrinsic::aarch64_sve_dupq_lane:
5263    return LowerDUPQLane(Op, DAG);
5264  case Intrinsic::aarch64_sve_convert_from_svbool:
5265    if (Op.getValueType() == MVT::aarch64svcount)
5266      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1));
5267    return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
5268  case Intrinsic::aarch64_sve_convert_to_svbool:
5269    if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
5270      return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1));
5271    return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
5272  case Intrinsic::aarch64_sve_fneg:
5273    return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5274                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5275  case Intrinsic::aarch64_sve_frintp:
5276    return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
5277                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5278  case Intrinsic::aarch64_sve_frintm:
5279    return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
5280                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5281  case Intrinsic::aarch64_sve_frinti:
    return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
5284  case Intrinsic::aarch64_sve_frintx:
5285    return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5286                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5287  case Intrinsic::aarch64_sve_frinta:
5288    return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
5289                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5290  case Intrinsic::aarch64_sve_frintn:
    return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
5293  case Intrinsic::aarch64_sve_frintz:
5294    return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
5295                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5296  case Intrinsic::aarch64_sve_ucvtf:
5297    return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
5298                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5299                       Op.getOperand(1));
5300  case Intrinsic::aarch64_sve_scvtf:
5301    return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
5302                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5303                       Op.getOperand(1));
5304  case Intrinsic::aarch64_sve_fcvtzu:
5305    return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
5306                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5307                       Op.getOperand(1));
5308  case Intrinsic::aarch64_sve_fcvtzs:
5309    return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
5310                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5311                       Op.getOperand(1));
5312  case Intrinsic::aarch64_sve_fsqrt:
5313    return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
5314                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5315  case Intrinsic::aarch64_sve_frecpx:
5316    return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
5317                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5318  case Intrinsic::aarch64_sve_frecpe_x:
5319    return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
5320                       Op.getOperand(1));
5321  case Intrinsic::aarch64_sve_frecps_x:
5322    return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
5323                       Op.getOperand(1), Op.getOperand(2));
5324  case Intrinsic::aarch64_sve_frsqrte_x:
5325    return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
5326                       Op.getOperand(1));
5327  case Intrinsic::aarch64_sve_frsqrts_x:
5328    return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
5329                       Op.getOperand(1), Op.getOperand(2));
5330  case Intrinsic::aarch64_sve_fabs:
5331    return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5332                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5333  case Intrinsic::aarch64_sve_abs:
5334    return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5335                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5336  case Intrinsic::aarch64_sve_neg:
5337    return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5338                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5339  case Intrinsic::aarch64_sve_insr: {
5340    SDValue Scalar = Op.getOperand(2);
5341    EVT ScalarTy = Scalar.getValueType();
5342    if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
5343      Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
5344
5345    return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
5346                       Op.getOperand(1), Scalar);
5347  }
5348  case Intrinsic::aarch64_sve_rbit:
5349    return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
5350                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5351                       Op.getOperand(1));
5352  case Intrinsic::aarch64_sve_revb:
5353    return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
5354                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5355  case Intrinsic::aarch64_sve_revh:
5356    return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
5357                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5358  case Intrinsic::aarch64_sve_revw:
5359    return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
5360                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5361  case Intrinsic::aarch64_sve_revd:
5362    return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
5363                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5364  case Intrinsic::aarch64_sve_sxtb:
5365    return DAG.getNode(
5366        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5367        Op.getOperand(2), Op.getOperand(3),
5368        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5369        Op.getOperand(1));
5370  case Intrinsic::aarch64_sve_sxth:
5371    return DAG.getNode(
5372        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5373        Op.getOperand(2), Op.getOperand(3),
5374        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5375        Op.getOperand(1));
5376  case Intrinsic::aarch64_sve_sxtw:
5377    return DAG.getNode(
5378        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5379        Op.getOperand(2), Op.getOperand(3),
5380        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5381        Op.getOperand(1));
5382  case Intrinsic::aarch64_sve_uxtb:
5383    return DAG.getNode(
5384        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5385        Op.getOperand(2), Op.getOperand(3),
5386        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5387        Op.getOperand(1));
5388  case Intrinsic::aarch64_sve_uxth:
5389    return DAG.getNode(
5390        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5391        Op.getOperand(2), Op.getOperand(3),
5392        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5393        Op.getOperand(1));
5394  case Intrinsic::aarch64_sve_uxtw:
5395    return DAG.getNode(
5396        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5397        Op.getOperand(2), Op.getOperand(3),
5398        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5399        Op.getOperand(1));
5400  case Intrinsic::localaddress: {
5401    const auto &MF = DAG.getMachineFunction();
5402    const auto *RegInfo = Subtarget->getRegisterInfo();
5403    unsigned Reg = RegInfo->getLocalAddressRegister(MF);
5404    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
5405                              Op.getSimpleValueType());
5406  }
5407
5408  case Intrinsic::eh_recoverfp: {
5409    // FIXME: This needs to be implemented to correctly handle highly aligned
    // stack objects. For now we simply return the incoming FP. Refer to
    // D53541 for more details.
5412    SDValue FnOp = Op.getOperand(1);
5413    SDValue IncomingFPOp = Op.getOperand(2);
5414    GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
5415    auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
5416    if (!Fn)
5417      report_fatal_error(
5418          "llvm.eh.recoverfp must take a function as the first argument");
5419    return IncomingFPOp;
5420  }
5421
5422  case Intrinsic::aarch64_neon_vsri:
5423  case Intrinsic::aarch64_neon_vsli:
5424  case Intrinsic::aarch64_sve_sri:
5425  case Intrinsic::aarch64_sve_sli: {
5426    EVT Ty = Op.getValueType();
5427
5428    if (!Ty.isVector())
5429      report_fatal_error("Unexpected type for aarch64_neon_vsli");
5430
5431    assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
5432
5433    bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
5434                        IntNo == Intrinsic::aarch64_sve_sri;
5435    unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
5436    return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
5437                       Op.getOperand(3));
5438  }
5439
5440  case Intrinsic::aarch64_neon_srhadd:
5441  case Intrinsic::aarch64_neon_urhadd:
5442  case Intrinsic::aarch64_neon_shadd:
5443  case Intrinsic::aarch64_neon_uhadd: {
5444    bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5445                        IntNo == Intrinsic::aarch64_neon_shadd);
5446    bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5447                          IntNo == Intrinsic::aarch64_neon_urhadd);
5448    unsigned Opcode = IsSignedAdd
5449                          ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
5450                          : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
5451    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5452                       Op.getOperand(2));
5453  }
5454  case Intrinsic::aarch64_neon_saddlp:
5455  case Intrinsic::aarch64_neon_uaddlp: {
5456    unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
5457                          ? AArch64ISD::UADDLP
5458                          : AArch64ISD::SADDLP;
5459    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
5460  }
5461  case Intrinsic::aarch64_neon_sdot:
5462  case Intrinsic::aarch64_neon_udot:
5463  case Intrinsic::aarch64_sve_sdot:
5464  case Intrinsic::aarch64_sve_udot: {
5465    unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
5466                       IntNo == Intrinsic::aarch64_sve_udot)
5467                          ? AArch64ISD::UDOT
5468                          : AArch64ISD::SDOT;
5469    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5470                       Op.getOperand(2), Op.getOperand(3));
5471  }
5472  case Intrinsic::get_active_lane_mask: {
5473    SDValue ID =
5474        DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
5475    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID,
5476                       Op.getOperand(1), Op.getOperand(2));
5477  }
5478  case Intrinsic::aarch64_neon_uaddlv: {
5479    EVT OpVT = Op.getOperand(1).getValueType();
5480    EVT ResVT = Op.getValueType();
5481    if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
5482                              OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
      // Use v4i32 rather than v2i32 in order to avoid an insert_subvector.
5484      SDValue UADDLV =
5485          DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
5486      SDValue EXTRACT_VEC_ELT =
5487          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV,
5488                      DAG.getConstant(0, dl, MVT::i64));
5489      return EXTRACT_VEC_ELT;
5490    }
5491    return SDValue();
5492  }
5493  case Intrinsic::experimental_cttz_elts: {
5494    SDValue NewCttzElts =
5495        DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1));
5496
5497    return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
5498  }
5499  }
5500}
5501
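// SVE gathers and scatters only support 32-bit and 64-bit vector indices, so
// request that i8 and i16 index elements be extended to i32.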
5502bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
5503  if (VT.getVectorElementType() == MVT::i8 ||
5504      VT.getVectorElementType() == MVT::i16) {
5505    EltTy = MVT::i32;
5506    return true;
5507  }
5508  return false;
5509}
5510
5511bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
5512                                                          EVT DataVT) const {
5513  const EVT IndexVT = Extend.getOperand(0).getValueType();
5514  // SVE only supports implicit extension of 32-bit indices.
5515  if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
5516    return false;
5517
5518  // Indices cannot be smaller than the main data type.
5519  if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
5520    return false;
5521
5522  // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
5523  // element container type, which would violate the previous clause.
5524  return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
5525}
5526
5527bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
5528  EVT ExtVT = ExtVal.getValueType();
5529  if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
5530    return false;
5531
5532  // It may be worth creating extending masked loads if there are multiple
5533  // masked loads using the same predicate. That way we'll end up creating
5534  // extending masked loads that may then get split by the legaliser. This
5535  // results in just one set of predicate unpacks at the start, instead of
5536  // multiple sets of vector unpacks after each load.
5537  if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
5538    if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
5539      // Disable extending masked loads for fixed-width for now, since the code
5540      // quality doesn't look great.
5541      if (!ExtVT.isScalableVector())
5542        return false;
5543
5544      unsigned NumExtMaskedLoads = 0;
5545      for (auto *U : Ld->getMask()->uses())
5546        if (isa<MaskedLoadSDNode>(U))
5547          NumExtMaskedLoads++;
5548
5549      if (NumExtMaskedLoads <= 1)
5550        return false;
5551    }
5552  }
5553
5554  return true;
5555}
5556
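// Select the gather load opcode for the given addressing mode: whether the
// index is scaled by the element size, whether it is signed, and whether it
// needs extending from 32 bits to 64 bits.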
5557unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
5558  std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
5559      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
5560       AArch64ISD::GLD1_MERGE_ZERO},
5561      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
5562       AArch64ISD::GLD1_UXTW_MERGE_ZERO},
5563      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
5564       AArch64ISD::GLD1_MERGE_ZERO},
5565      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
5566       AArch64ISD::GLD1_SXTW_MERGE_ZERO},
5567      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
5568       AArch64ISD::GLD1_SCALED_MERGE_ZERO},
5569      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
5570       AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
5571      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
5572       AArch64ISD::GLD1_SCALED_MERGE_ZERO},
5573      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
5574       AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
5575  };
5576  auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
5577  return AddrModes.find(Key)->second;
5578}
5579
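// Map a gather load opcode onto its sign-extending (GLD1S) counterpart.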
5580unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
5581  switch (Opcode) {
5582  default:
5583    llvm_unreachable("unimplemented opcode");
5584    return Opcode;
5585  case AArch64ISD::GLD1_MERGE_ZERO:
5586    return AArch64ISD::GLD1S_MERGE_ZERO;
5587  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
5588    return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
5589  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
5590    return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
5591  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
5592    return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
5593  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
5594    return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
5595  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
5596    return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
5597  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
5598    return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
5599  }
5600}
5601
5602SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
5603                                            SelectionDAG &DAG) const {
5604  MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
5605
5606  SDLoc DL(Op);
5607  SDValue Chain = MGT->getChain();
5608  SDValue PassThru = MGT->getPassThru();
5609  SDValue Mask = MGT->getMask();
5610  SDValue BasePtr = MGT->getBasePtr();
5611  SDValue Index = MGT->getIndex();
5612  SDValue Scale = MGT->getScale();
5613  EVT VT = Op.getValueType();
5614  EVT MemVT = MGT->getMemoryVT();
5615  ISD::LoadExtType ExtType = MGT->getExtensionType();
5616  ISD::MemIndexType IndexType = MGT->getIndexType();
5617
  // SVE supports zero (and so undef) passthrough values only; everything else
  // must be handled manually by an explicit select on the load's output.
5620  if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
5621    SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
5622    SDValue Load =
5623        DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5624                            MGT->getMemOperand(), IndexType, ExtType);
5625    SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
5626    return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
5627  }
5628
5629  bool IsScaled = MGT->isIndexScaled();
5630  bool IsSigned = MGT->isIndexSigned();
5631
  // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
  // must be calculated beforehand.
5634  uint64_t ScaleVal = Scale->getAsZExtVal();
5635  if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5636    assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5637    EVT IndexVT = Index.getValueType();
5638    Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5639                        DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5640    Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5641
5642    SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5643    return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5644                               MGT->getMemOperand(), IndexType, ExtType);
5645  }
5646
5647  // Lower fixed length gather to a scalable equivalent.
5648  if (VT.isFixedLengthVector()) {
5649    assert(Subtarget->useSVEForFixedLengthVectors() &&
5650           "Cannot lower when not using SVE for fixed vectors!");
5651
5652    // NOTE: Handle floating-point as if integer then bitcast the result.
5653    EVT DataVT = VT.changeVectorElementTypeToInteger();
5654    MemVT = MemVT.changeVectorElementTypeToInteger();
5655
5656    // Find the smallest integer fixed length vector we can use for the gather.
5657    EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5658    if (DataVT.getVectorElementType() == MVT::i64 ||
5659        Index.getValueType().getVectorElementType() == MVT::i64 ||
5660        Mask.getValueType().getVectorElementType() == MVT::i64)
5661      PromotedVT = VT.changeVectorElementType(MVT::i64);
5662
5663    // Promote vector operands except for passthrough, which we know is either
5664    // undef or zero, and thus best constructed directly.
5665    unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5666    Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
5667    Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
5668
5669    // A promoted result type forces the need for an extending load.
5670    if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
5671      ExtType = ISD::EXTLOAD;
5672
5673    EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
5674
5675    // Convert fixed length vector operands to scalable.
5676    MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
5677    Index = convertToScalableVector(DAG, ContainerVT, Index);
5678    Mask = convertFixedMaskToScalableVector(Mask, DAG);
5679    PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
5680                                   : DAG.getConstant(0, DL, ContainerVT);
5681
5682    // Emit equivalent scalable vector gather.
5683    SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5684    SDValue Load =
5685        DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
5686                            Ops, MGT->getMemOperand(), IndexType, ExtType);
5687
5688    // Extract fixed length data then convert to the required result type.
5689    SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
5690    Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
5691    if (VT.isFloatingPoint())
5692      Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
5693
5694    return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
5695  }
5696
5697  // Everything else is legal.
5698  return Op;
5699}
5700
5701SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
5702                                             SelectionDAG &DAG) const {
5703  MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
5704
5705  SDLoc DL(Op);
5706  SDValue Chain = MSC->getChain();
5707  SDValue StoreVal = MSC->getValue();
5708  SDValue Mask = MSC->getMask();
5709  SDValue BasePtr = MSC->getBasePtr();
5710  SDValue Index = MSC->getIndex();
5711  SDValue Scale = MSC->getScale();
5712  EVT VT = StoreVal.getValueType();
5713  EVT MemVT = MSC->getMemoryVT();
5714  ISD::MemIndexType IndexType = MSC->getIndexType();
5715  bool Truncating = MSC->isTruncatingStore();
5716
5717  bool IsScaled = MSC->isIndexScaled();
5718  bool IsSigned = MSC->isIndexSigned();
5719
  // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
  // must be calculated beforehand.
5722  uint64_t ScaleVal = Scale->getAsZExtVal();
5723  if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5724    assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5725    EVT IndexVT = Index.getValueType();
5726    Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5727                        DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5728    Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5729
5730    SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
5731    return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
5732                                MSC->getMemOperand(), IndexType, Truncating);
5733  }
5734
5735  // Lower fixed length scatter to a scalable equivalent.
5736  if (VT.isFixedLengthVector()) {
5737    assert(Subtarget->useSVEForFixedLengthVectors() &&
5738           "Cannot lower when not using SVE for fixed vectors!");
5739
    // Once bitcast, we treat floating-point scatters as if they were integer.
5741    if (VT.isFloatingPoint()) {
5742      VT = VT.changeVectorElementTypeToInteger();
5743      MemVT = MemVT.changeVectorElementTypeToInteger();
5744      StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
5745    }
5746
5747    // Find the smallest integer fixed length vector we can use for the scatter.
5748    EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5749    if (VT.getVectorElementType() == MVT::i64 ||
5750        Index.getValueType().getVectorElementType() == MVT::i64 ||
5751        Mask.getValueType().getVectorElementType() == MVT::i64)
5752      PromotedVT = VT.changeVectorElementType(MVT::i64);
5753
5754    // Promote vector operands.
5755    unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5756    Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
5757    Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
5758    StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
5759
5760    // A promoted value type forces the need for a truncating store.
5761    if (PromotedVT != VT)
5762      Truncating = true;
5763
5764    EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
5765
5766    // Convert fixed length vector operands to scalable.
5767    MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
5768    Index = convertToScalableVector(DAG, ContainerVT, Index);
5769    Mask = convertFixedMaskToScalableVector(Mask, DAG);
5770    StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
5771
5772    // Emit equivalent scalable vector scatter.
5773    SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
5774    return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
5775                                MSC->getMemOperand(), IndexType, Truncating);
5776  }
5777
5778  // Everything else is legal.
5779  return Op;
5780}
5781
5782SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
5783  SDLoc DL(Op);
5784  MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
5785  assert(LoadNode && "Expected custom lowering of a masked load node");
5786  EVT VT = Op->getValueType(0);
5787
5788  if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
5789    return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
5790
5791  SDValue PassThru = LoadNode->getPassThru();
5792  SDValue Mask = LoadNode->getMask();
5793
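  // As with gathers, only zero (and so undef) passthrough values are supported
  // directly; anything else is handled by an explicit select on the load's
  // output below.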
5794  if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
5795    return Op;
5796
5797  SDValue Load = DAG.getMaskedLoad(
5798      VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
5799      LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
5800      LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
5801      LoadNode->getExtensionType());
5802
5803  SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
5804
5805  return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
5806}
5807
5808// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
5809static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
5810                                        EVT VT, EVT MemVT,
5811                                        SelectionDAG &DAG) {
5812  assert(VT.isVector() && "VT should be a vector type");
5813  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
5814
5815  SDValue Value = ST->getValue();
5816
  // We first extend the promoted v4i16 to v8i16, truncate to v8i8, and extract
  // the word lane which represents the v4i8 subvector. This optimizes the
  // store to:
5820  //
5821  //   xtn  v0.8b, v0.8h
5822  //   str  s0, [x0]
5823
5824  SDValue Undef = DAG.getUNDEF(MVT::i16);
5825  SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
5826                                        {Undef, Undef, Undef, Undef});
5827
5828  SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
5829                                 Value, UndefVec);
5830  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
5831
5832  Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
5833  SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
5834                                     Trunc, DAG.getConstant(0, DL, MVT::i64));
5835
5836  return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
5837                      ST->getBasePtr(), ST->getMemOperand());
5838}
5839
// Custom lowering for any store, vector or scalar, with or without truncation.
// Currently we only custom lower truncating stores from v4i16 to v4i8 and
// volatile stores of i128.
5843SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
5844                                          SelectionDAG &DAG) const {
5845  SDLoc Dl(Op);
5846  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  assert(StoreNode && "Can only custom lower store nodes");
5848
5849  SDValue Value = StoreNode->getValue();
5850
5851  EVT VT = Value.getValueType();
5852  EVT MemVT = StoreNode->getMemoryVT();
5853
5854  if (VT.isVector()) {
5855    if (useSVEForFixedLengthVectorVT(
5856            VT,
5857            /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
5858      return LowerFixedLengthVectorStoreToSVE(Op, DAG);
5859
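    // Scalarize vector stores that are under-aligned for their memory type
    // when the target does not allow the misaligned access.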
5860    unsigned AS = StoreNode->getAddressSpace();
5861    Align Alignment = StoreNode->getAlign();
5862    if (Alignment < MemVT.getStoreSize() &&
5863        !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
5864                                        StoreNode->getMemOperand()->getFlags(),
5865                                        nullptr)) {
5866      return scalarizeVectorStore(StoreNode, DAG);
5867    }
5868
5869    if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
5870        MemVT == MVT::v4i8) {
5871      return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
5872    }
    // 256-bit non-temporal stores can be lowered to STNP. Do this as part of
    // the custom lowering, as there are no unpaired non-temporal stores and
    // legalization will break up 256-bit inputs.
5876    ElementCount EC = MemVT.getVectorElementCount();
5877    if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
5878        EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
5879        (MemVT.getScalarSizeInBits() == 8u ||
5880         MemVT.getScalarSizeInBits() == 16u ||
5881         MemVT.getScalarSizeInBits() == 32u ||
5882         MemVT.getScalarSizeInBits() == 64u)) {
5883      SDValue Lo =
5884          DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
5885                      MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
5886                      StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
5887      SDValue Hi =
5888          DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
5889                      MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
5890                      StoreNode->getValue(),
5891                      DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
5892      SDValue Result = DAG.getMemIntrinsicNode(
5893          AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
5894          {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
5895          StoreNode->getMemoryVT(), StoreNode->getMemOperand());
5896      return Result;
5897    }
5898  } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
5899    return LowerStore128(Op, DAG);
5900  } else if (MemVT == MVT::i64x8) {
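    // An LS64 (i64x8) store is expanded into eight consecutive i64 stores of
    // the extracted parts.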
5901    SDValue Value = StoreNode->getValue();
5902    assert(Value->getValueType(0) == MVT::i64x8);
5903    SDValue Chain = StoreNode->getChain();
5904    SDValue Base = StoreNode->getBasePtr();
5905    EVT PtrVT = Base.getValueType();
5906    for (unsigned i = 0; i < 8; i++) {
5907      SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
5908                                 Value, DAG.getConstant(i, Dl, MVT::i32));
5909      SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
5910                                DAG.getConstant(i * 8, Dl, PtrVT));
5911      Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
5912                           StoreNode->getOriginalAlign());
5913    }
5914    return Chain;
5915  }
5916
5917  return SDValue();
5918}
5919
5920/// Lower atomic or volatile 128-bit stores to a single STP instruction.
5921SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
5922                                             SelectionDAG &DAG) const {
5923  MemSDNode *StoreNode = cast<MemSDNode>(Op);
5924  assert(StoreNode->getMemoryVT() == MVT::i128);
5925  assert(StoreNode->isVolatile() || StoreNode->isAtomic());
5926
5927  bool IsStoreRelease =
5928      StoreNode->getMergedOrdering() == AtomicOrdering::Release;
5929  if (StoreNode->isAtomic())
5930    assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
5931            Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
5932           StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
5933           StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
5934
5935  SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
5936                   StoreNode->getOpcode() == ISD::ATOMIC_STORE)
5937                      ? StoreNode->getOperand(1)
5938                      : StoreNode->getOperand(2);
5939  SDLoc DL(Op);
5940  auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
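  // Store-release orderings are emitted as STILP; all other cases use a plain
  // STP.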
5941  unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
5942  if (DAG.getDataLayout().isBigEndian())
5943    std::swap(StoreValue.first, StoreValue.second);
5944  SDValue Result = DAG.getMemIntrinsicNode(
5945      Opcode, DL, DAG.getVTList(MVT::Other),
5946      {StoreNode->getChain(), StoreValue.first, StoreValue.second,
5947       StoreNode->getBasePtr()},
5948      StoreNode->getMemoryVT(), StoreNode->getMemOperand());
5949  return Result;
5950}
5951
5952SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
5953                                         SelectionDAG &DAG) const {
5954  SDLoc DL(Op);
5955  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
5956  assert(LoadNode && "Expected custom lowering of a load node");
5957
5958  if (LoadNode->getMemoryVT() == MVT::i64x8) {
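    // An LS64 (i64x8) load is expanded into eight consecutive i64 loads whose
    // results are recombined with LS64_BUILD.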
5959    SmallVector<SDValue, 8> Ops;
5960    SDValue Base = LoadNode->getBasePtr();
5961    SDValue Chain = LoadNode->getChain();
5962    EVT PtrVT = Base.getValueType();
5963    for (unsigned i = 0; i < 8; i++) {
5964      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
5965                                DAG.getConstant(i * 8, DL, PtrVT));
5966      SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
5967                                 LoadNode->getPointerInfo(),
5968                                 LoadNode->getOriginalAlign());
5969      Ops.push_back(Part);
5970      Chain = SDValue(Part.getNode(), 1);
5971    }
5972    SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
5973    return DAG.getMergeValues({Loaded, Chain}, DL);
5974  }
5975
5976  // Custom lowering for extending v4i8 vector loads.
5977  EVT VT = Op->getValueType(0);
5978  assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
5979
5980  if (LoadNode->getMemoryVT() != MVT::v4i8)
5981    return SDValue();
5982
5983  unsigned ExtType;
5984  if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
5985    ExtType = ISD::SIGN_EXTEND;
5986  else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
5987           LoadNode->getExtensionType() == ISD::EXTLOAD)
5988    ExtType = ISD::ZERO_EXTEND;
5989  else
5990    return SDValue();
5991
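  // Load the four i8 elements as a single 32-bit (f32) scalar, move it into a
  // vector register, bitcast to v8i8 and extend, then take the low v4i16 half
  // (extending once more for v4i32 results).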
5992  SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
5993                             LoadNode->getBasePtr(), MachinePointerInfo());
5994  SDValue Chain = Load.getValue(1);
5995  SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
5996  SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
5997  SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
5998  Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
5999                    DAG.getConstant(0, DL, MVT::i64));
6000  if (VT == MVT::v4i32)
6001    Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
6002  return DAG.getMergeValues({Ext, Chain}, DL);
6003}
6004
6005// Generate SUBS and CSEL for integer abs.
6006SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
6007  MVT VT = Op.getSimpleValueType();
6008
6009  if (VT.isVector())
6010    return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
6011
6012  SDLoc DL(Op);
6013  SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
6014                            Op.getOperand(0));
6015  // Generate SUBS & CSEL.
6016  SDValue Cmp =
6017      DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
6018                  Op.getOperand(0), DAG.getConstant(0, DL, VT));
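  // CSEL keeps the original value when it is non-negative (PL) and otherwise
  // selects the negation.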
6019  return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
6020                     DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
6021                     Cmp.getValue(1));
6022}
6023
6024static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
6025  SDValue Chain = Op.getOperand(0);
6026  SDValue Cond = Op.getOperand(1);
6027  SDValue Dest = Op.getOperand(2);
6028
6029  AArch64CC::CondCode CC;
6030  if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
6031    SDLoc dl(Op);
6032    SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
6033    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6034                       Cmp);
6035  }
6036
6037  return SDValue();
6038}
6039
// Treat FSHR with a constant shift amount as a legal operation; otherwise it
// is expanded. FSHL is converted to FSHR before deciding what to do with it.
6042static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
6043  SDValue Shifts = Op.getOperand(2);
  // Check if the shift amount is a constant.
  // If the opcode is FSHL, convert it to an equivalent FSHR.
6046  if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
6047    SDLoc DL(Op);
6048    MVT VT = Op.getSimpleValueType();
6049
6050    if (Op.getOpcode() == ISD::FSHL) {
6051      unsigned int NewShiftNo =
6052          VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
6053      return DAG.getNode(
6054          ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
6055          DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
6056    } else if (Op.getOpcode() == ISD::FSHR) {
6057      return Op;
6058    }
6059  }
6060
6061  return SDValue();
6062}
6063
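// Lower scalar FLDEXP by inserting the operands into lane 0 of SVE vectors,
// scaling them with the SVE fscale intrinsic and extracting lane 0 of the
// result. f16 inputs are extended to f32 first and rounded back afterwards.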
6064static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
6065  SDValue X = Op.getOperand(0);
6066  EVT XScalarTy = X.getValueType();
6067  SDValue Exp = Op.getOperand(1);
6068
6069  SDLoc DL(Op);
6070  EVT XVT, ExpVT;
6071  switch (Op.getSimpleValueType().SimpleTy) {
6072  default:
6073    return SDValue();
6074  case MVT::f16:
6075    X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
6076    [[fallthrough]];
6077  case MVT::f32:
6078    XVT = MVT::nxv4f32;
6079    ExpVT = MVT::nxv4i32;
6080    break;
6081  case MVT::f64:
6082    XVT = MVT::nxv2f64;
6083    ExpVT = MVT::nxv2i64;
6084    Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
6085    break;
6086  }
6087
6088  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
6089  SDValue VX =
6090      DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
6091  SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
6092                             DAG.getUNDEF(ExpVT), Exp, Zero);
6093  SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
6094                         AArch64SVEPredPattern::all);
6095  SDValue FScale =
6096      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT,
6097                  DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
6098                  VPg, VX, VExp);
6099  SDValue Final =
6100      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
6101  if (X.getValueType() != XScalarTy)
6102    Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
6103                        DAG.getIntPtrConstant(1, SDLoc(Op)));
6104  return Final;
6105}
6106
6107SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
6108                                              SelectionDAG &DAG) const {
6109  LLVM_DEBUG(dbgs() << "Custom lowering: ");
6110  LLVM_DEBUG(Op.dump());
6111
6112  switch (Op.getOpcode()) {
6113  default:
6114    llvm_unreachable("unimplemented operand");
6115    return SDValue();
6116  case ISD::BITCAST:
6117    return LowerBITCAST(Op, DAG);
6118  case ISD::GlobalAddress:
6119    return LowerGlobalAddress(Op, DAG);
6120  case ISD::GlobalTLSAddress:
6121    return LowerGlobalTLSAddress(Op, DAG);
6122  case ISD::SETCC:
6123  case ISD::STRICT_FSETCC:
6124  case ISD::STRICT_FSETCCS:
6125    return LowerSETCC(Op, DAG);
6126  case ISD::SETCCCARRY:
6127    return LowerSETCCCARRY(Op, DAG);
6128  case ISD::BRCOND:
6129    return LowerBRCOND(Op, DAG);
6130  case ISD::BR_CC:
6131    return LowerBR_CC(Op, DAG);
6132  case ISD::SELECT:
6133    return LowerSELECT(Op, DAG);
6134  case ISD::SELECT_CC:
6135    return LowerSELECT_CC(Op, DAG);
6136  case ISD::JumpTable:
6137    return LowerJumpTable(Op, DAG);
6138  case ISD::BR_JT:
6139    return LowerBR_JT(Op, DAG);
6140  case ISD::ConstantPool:
6141    return LowerConstantPool(Op, DAG);
6142  case ISD::BlockAddress:
6143    return LowerBlockAddress(Op, DAG);
6144  case ISD::VASTART:
6145    return LowerVASTART(Op, DAG);
6146  case ISD::VACOPY:
6147    return LowerVACOPY(Op, DAG);
6148  case ISD::VAARG:
6149    return LowerVAARG(Op, DAG);
6150  case ISD::UADDO_CARRY:
6151    return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
6152  case ISD::USUBO_CARRY:
6153    return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
6154  case ISD::SADDO_CARRY:
6155    return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
6156  case ISD::SSUBO_CARRY:
6157    return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
6158  case ISD::SADDO:
6159  case ISD::UADDO:
6160  case ISD::SSUBO:
6161  case ISD::USUBO:
6162  case ISD::SMULO:
6163  case ISD::UMULO:
6164    return LowerXALUO(Op, DAG);
6165  case ISD::FADD:
6166    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
6167  case ISD::FSUB:
6168    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
6169  case ISD::FMUL:
6170    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
6171  case ISD::FMA:
6172    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
6173  case ISD::FDIV:
6174    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
6175  case ISD::FNEG:
6176    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
6177  case ISD::FCEIL:
6178    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
6179  case ISD::FFLOOR:
6180    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
6181  case ISD::FNEARBYINT:
6182    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
6183  case ISD::FRINT:
6184    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
6185  case ISD::FROUND:
6186    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
6187  case ISD::FROUNDEVEN:
6188    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
6189  case ISD::FTRUNC:
6190    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
6191  case ISD::FSQRT:
6192    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
6193  case ISD::FABS:
6194    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
6195  case ISD::FP_ROUND:
6196  case ISD::STRICT_FP_ROUND:
6197    return LowerFP_ROUND(Op, DAG);
6198  case ISD::FP_EXTEND:
6199    return LowerFP_EXTEND(Op, DAG);
6200  case ISD::FRAMEADDR:
6201    return LowerFRAMEADDR(Op, DAG);
6202  case ISD::SPONENTRY:
6203    return LowerSPONENTRY(Op, DAG);
6204  case ISD::RETURNADDR:
6205    return LowerRETURNADDR(Op, DAG);
6206  case ISD::ADDROFRETURNADDR:
6207    return LowerADDROFRETURNADDR(Op, DAG);
6208  case ISD::CONCAT_VECTORS:
6209    return LowerCONCAT_VECTORS(Op, DAG);
6210  case ISD::INSERT_VECTOR_ELT:
6211    return LowerINSERT_VECTOR_ELT(Op, DAG);
6212  case ISD::EXTRACT_VECTOR_ELT:
6213    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
6214  case ISD::BUILD_VECTOR:
6215    return LowerBUILD_VECTOR(Op, DAG);
6216  case ISD::ZERO_EXTEND_VECTOR_INREG:
6217    return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
6218  case ISD::VECTOR_SHUFFLE:
6219    return LowerVECTOR_SHUFFLE(Op, DAG);
6220  case ISD::SPLAT_VECTOR:
6221    return LowerSPLAT_VECTOR(Op, DAG);
6222  case ISD::EXTRACT_SUBVECTOR:
6223    return LowerEXTRACT_SUBVECTOR(Op, DAG);
6224  case ISD::INSERT_SUBVECTOR:
6225    return LowerINSERT_SUBVECTOR(Op, DAG);
6226  case ISD::SDIV:
6227  case ISD::UDIV:
6228    return LowerDIV(Op, DAG);
6229  case ISD::SMIN:
6230  case ISD::UMIN:
6231  case ISD::SMAX:
6232  case ISD::UMAX:
6233    return LowerMinMax(Op, DAG);
6234  case ISD::SRA:
6235  case ISD::SRL:
6236  case ISD::SHL:
6237    return LowerVectorSRA_SRL_SHL(Op, DAG);
6238  case ISD::SHL_PARTS:
6239  case ISD::SRL_PARTS:
6240  case ISD::SRA_PARTS:
6241    return LowerShiftParts(Op, DAG);
6242  case ISD::CTPOP:
6243  case ISD::PARITY:
6244    return LowerCTPOP_PARITY(Op, DAG);
6245  case ISD::FCOPYSIGN:
6246    return LowerFCOPYSIGN(Op, DAG);
6247  case ISD::OR:
6248    return LowerVectorOR(Op, DAG);
6249  case ISD::XOR:
6250    return LowerXOR(Op, DAG);
6251  case ISD::PREFETCH:
6252    return LowerPREFETCH(Op, DAG);
6253  case ISD::SINT_TO_FP:
6254  case ISD::UINT_TO_FP:
6255  case ISD::STRICT_SINT_TO_FP:
6256  case ISD::STRICT_UINT_TO_FP:
6257    return LowerINT_TO_FP(Op, DAG);
6258  case ISD::FP_TO_SINT:
6259  case ISD::FP_TO_UINT:
6260  case ISD::STRICT_FP_TO_SINT:
6261  case ISD::STRICT_FP_TO_UINT:
6262    return LowerFP_TO_INT(Op, DAG);
6263  case ISD::FP_TO_SINT_SAT:
6264  case ISD::FP_TO_UINT_SAT:
6265    return LowerFP_TO_INT_SAT(Op, DAG);
6266  case ISD::FSINCOS:
6267    return LowerFSINCOS(Op, DAG);
6268  case ISD::GET_ROUNDING:
6269    return LowerGET_ROUNDING(Op, DAG);
6270  case ISD::SET_ROUNDING:
6271    return LowerSET_ROUNDING(Op, DAG);
6272  case ISD::MUL:
6273    return LowerMUL(Op, DAG);
6274  case ISD::MULHS:
6275    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
6276  case ISD::MULHU:
6277    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
6278  case ISD::INTRINSIC_W_CHAIN:
6279    return LowerINTRINSIC_W_CHAIN(Op, DAG);
6280  case ISD::INTRINSIC_WO_CHAIN:
6281    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6282  case ISD::INTRINSIC_VOID:
6283    return LowerINTRINSIC_VOID(Op, DAG);
6284  case ISD::ATOMIC_STORE:
6285    if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
6286      assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
6287      return LowerStore128(Op, DAG);
6288    }
6289    return SDValue();
6290  case ISD::STORE:
6291    return LowerSTORE(Op, DAG);
6292  case ISD::MSTORE:
6293    return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
6294  case ISD::MGATHER:
6295    return LowerMGATHER(Op, DAG);
6296  case ISD::MSCATTER:
6297    return LowerMSCATTER(Op, DAG);
6298  case ISD::VECREDUCE_SEQ_FADD:
6299    return LowerVECREDUCE_SEQ_FADD(Op, DAG);
6300  case ISD::VECREDUCE_ADD:
6301  case ISD::VECREDUCE_AND:
6302  case ISD::VECREDUCE_OR:
6303  case ISD::VECREDUCE_XOR:
6304  case ISD::VECREDUCE_SMAX:
6305  case ISD::VECREDUCE_SMIN:
6306  case ISD::VECREDUCE_UMAX:
6307  case ISD::VECREDUCE_UMIN:
6308  case ISD::VECREDUCE_FADD:
6309  case ISD::VECREDUCE_FMAX:
6310  case ISD::VECREDUCE_FMIN:
6311  case ISD::VECREDUCE_FMAXIMUM:
6312  case ISD::VECREDUCE_FMINIMUM:
6313    return LowerVECREDUCE(Op, DAG);
6314  case ISD::ATOMIC_LOAD_AND:
6315    return LowerATOMIC_LOAD_AND(Op, DAG);
6316  case ISD::DYNAMIC_STACKALLOC:
6317    return LowerDYNAMIC_STACKALLOC(Op, DAG);
6318  case ISD::VSCALE:
6319    return LowerVSCALE(Op, DAG);
6320  case ISD::ANY_EXTEND:
6321  case ISD::SIGN_EXTEND:
6322  case ISD::ZERO_EXTEND:
6323    return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
6324  case ISD::SIGN_EXTEND_INREG: {
6325    // Only custom lower when ExtraVT has a legal byte based element type.
6326    EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
6327    EVT ExtraEltVT = ExtraVT.getVectorElementType();
6328    if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
6329        (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
6330      return SDValue();
6331
6332    return LowerToPredicatedOp(Op, DAG,
6333                               AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
6334  }
6335  case ISD::TRUNCATE:
6336    return LowerTRUNCATE(Op, DAG);
6337  case ISD::MLOAD:
6338    return LowerMLOAD(Op, DAG);
6339  case ISD::LOAD:
6340    if (useSVEForFixedLengthVectorVT(Op.getValueType(),
6341                                     !Subtarget->isNeonAvailable()))
6342      return LowerFixedLengthVectorLoadToSVE(Op, DAG);
6343    return LowerLOAD(Op, DAG);
6344  case ISD::ADD:
6345  case ISD::AND:
6346  case ISD::SUB:
6347    return LowerToScalableOp(Op, DAG);
6348  case ISD::FMAXIMUM:
6349    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
6350  case ISD::FMAXNUM:
6351    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
6352  case ISD::FMINIMUM:
6353    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
6354  case ISD::FMINNUM:
6355    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
6356  case ISD::VSELECT:
6357    return LowerFixedLengthVectorSelectToSVE(Op, DAG);
6358  case ISD::ABS:
6359    return LowerABS(Op, DAG);
6360  case ISD::ABDS:
6361    return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
6362  case ISD::ABDU:
6363    return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
6364  case ISD::AVGFLOORS:
6365    return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
6366  case ISD::AVGFLOORU:
6367    return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
6368  case ISD::AVGCEILS:
6369    return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
6370  case ISD::AVGCEILU:
6371    return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
6372  case ISD::BITREVERSE:
6373    return LowerBitreverse(Op, DAG);
6374  case ISD::BSWAP:
6375    return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
6376  case ISD::CTLZ:
6377    return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
6378  case ISD::CTTZ:
6379    return LowerCTTZ(Op, DAG);
6380  case ISD::VECTOR_SPLICE:
6381    return LowerVECTOR_SPLICE(Op, DAG);
6382  case ISD::VECTOR_DEINTERLEAVE:
6383    return LowerVECTOR_DEINTERLEAVE(Op, DAG);
6384  case ISD::VECTOR_INTERLEAVE:
6385    return LowerVECTOR_INTERLEAVE(Op, DAG);
6386  case ISD::LROUND:
6387  case ISD::LLROUND:
6388  case ISD::LRINT:
6389  case ISD::LLRINT: {
6390    assert(Op.getOperand(0).getValueType() == MVT::f16 &&
6391           "Expected custom lowering of rounding operations only for f16");
6392    SDLoc DL(Op);
6393    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
6394    return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
6395  }
6396  case ISD::STRICT_LROUND:
6397  case ISD::STRICT_LLROUND:
6398  case ISD::STRICT_LRINT:
6399  case ISD::STRICT_LLRINT: {
6400    assert(Op.getOperand(1).getValueType() == MVT::f16 &&
6401           "Expected custom lowering of rounding operations only for f16");
6402    SDLoc DL(Op);
6403    SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
6404                              {Op.getOperand(0), Op.getOperand(1)});
6405    return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
6406                       {Ext.getValue(1), Ext.getValue(0)});
6407  }
6408  case ISD::WRITE_REGISTER: {
6409    assert(Op.getOperand(2).getValueType() == MVT::i128 &&
6410           "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
6411    SDLoc DL(Op);
6412
6413    SDValue Chain = Op.getOperand(0);
6414    SDValue SysRegName = Op.getOperand(1);
6415    std::pair<SDValue, SDValue> Pair =
6416        DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
6417
6418    // chain = MSRR(chain, sysregname, lo, hi)
6419    SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
6420                                 SysRegName, Pair.first, Pair.second);
6421
6422    return Result;
6423  }
6424  case ISD::FSHL:
6425  case ISD::FSHR:
6426    return LowerFunnelShift(Op, DAG);
6427  case ISD::FLDEXP:
6428    return LowerFLDEXP(Op, DAG);
6429  }
6430}
6431
6432bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
6433  return !Subtarget->useSVEForFixedLengthVectors();
6434}
6435
6436bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
6437    EVT VT, bool OverrideNEON) const {
6438  if (!VT.isFixedLengthVector() || !VT.isSimple())
6439    return false;
6440
6441  // Don't use SVE for vectors we cannot scalarize if required.
6442  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
6443  // Fixed length predicates should be promoted to i8.
6444  // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
6445  case MVT::i1:
6446  default:
6447    return false;
6448  case MVT::i8:
6449  case MVT::i16:
6450  case MVT::i32:
6451  case MVT::i64:
6452  case MVT::f16:
6453  case MVT::f32:
6454  case MVT::f64:
6455    break;
6456  }
6457
6458  // NEON-sized vectors can be emulated using SVE instructions.
6459  if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
6460    return Subtarget->hasSVEorSME();
6461
6462  // Ensure NEON MVTs only belong to a single register class.
6463  if (VT.getFixedSizeInBits() <= 128)
6464    return false;
6465
6466  // Ensure wider than NEON code generation is enabled.
6467  if (!Subtarget->useSVEForFixedLengthVectors())
6468    return false;
6469
6470  // Don't use SVE for types that don't fit.
6471  if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
6472    return false;
6473
6474  // TODO: Perhaps an artificial restriction, but worth having whilst getting
6475  // the base fixed length SVE support in place.
6476  if (!VT.isPow2VectorType())
6477    return false;
6478
6479  return true;
6480}
6481
6482//===----------------------------------------------------------------------===//
6483//                      Calling Convention Implementation
6484//===----------------------------------------------------------------------===//
6485
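/// Returns the intrinsic ID for an ISD::INTRINSIC_WO_CHAIN node, or
/// Intrinsic::not_intrinsic if the node is not an intrinsic call.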
6486static unsigned getIntrinsicID(const SDNode *N) {
6487  unsigned Opcode = N->getOpcode();
6488  switch (Opcode) {
6489  default:
6490    return Intrinsic::not_intrinsic;
6491  case ISD::INTRINSIC_WO_CHAIN: {
6492    unsigned IID = N->getConstantOperandVal(0);
6493    if (IID < Intrinsic::num_intrinsics)
6494      return IID;
6495    return Intrinsic::not_intrinsic;
6496  }
6497  }
6498}
6499
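/// Reassociation is not profitable if the first operand has other uses, or if
/// it would break an add(add, mul) chain that could otherwise be matched as a
/// single smlal/umlal instruction.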
6500bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
6501                                                SDValue N1) const {
6502  if (!N0.hasOneUse())
6503    return false;
6504
6505  unsigned IID = getIntrinsicID(N1.getNode());
6506  // Avoid reassociating expressions that can be lowered to smlal/umlal.
6507  if (IID == Intrinsic::aarch64_neon_umull ||
6508      N1.getOpcode() == AArch64ISD::UMULL ||
6509      IID == Intrinsic::aarch64_neon_smull ||
6510      N1.getOpcode() == AArch64ISD::SMULL)
6511    return N0.getOpcode() != ISD::ADD;
6512
6513  return true;
6514}
6515
6516/// Selects the correct CCAssignFn for a given CallingConvention value.
6517CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
6518                                                     bool IsVarArg) const {
6519  switch (CC) {
6520  default:
6521    report_fatal_error("Unsupported calling convention.");
6522  case CallingConv::GHC:
6523    return CC_AArch64_GHC;
6524  case CallingConv::C:
6525  case CallingConv::Fast:
6526  case CallingConv::PreserveMost:
6527  case CallingConv::PreserveAll:
6528  case CallingConv::CXX_FAST_TLS:
6529  case CallingConv::Swift:
6530  case CallingConv::SwiftTail:
6531  case CallingConv::Tail:
6532  case CallingConv::GRAAL:
6533    if (Subtarget->isTargetWindows()) {
6534      if (IsVarArg) {
6535        if (Subtarget->isWindowsArm64EC())
6536          return CC_AArch64_Arm64EC_VarArg;
6537        return CC_AArch64_Win64_VarArg;
6538      }
6539      return CC_AArch64_Win64PCS;
6540    }
6541    if (!Subtarget->isTargetDarwin())
6542      return CC_AArch64_AAPCS;
6543    if (!IsVarArg)
6544      return CC_AArch64_DarwinPCS;
6545    return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
6546                                      : CC_AArch64_DarwinPCS_VarArg;
  case CallingConv::Win64:
    if (IsVarArg) {
      if (Subtarget->isWindowsArm64EC())
        return CC_AArch64_Arm64EC_VarArg;
      return CC_AArch64_Win64_VarArg;
    }
    return CC_AArch64_Win64PCS;
  case CallingConv::CFGuard_Check:
    if (Subtarget->isWindowsArm64EC())
      return CC_AArch64_Arm64EC_CFGuard_Check;
    return CC_AArch64_Win64_CFGuard_Check;
  case CallingConv::AArch64_VectorCall:
  case CallingConv::AArch64_SVE_VectorCall:
  case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
  case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
    return CC_AArch64_AAPCS;
6563  case CallingConv::ARM64EC_Thunk_X64:
6564    return CC_AArch64_Arm64EC_Thunk;
6565  case CallingConv::ARM64EC_Thunk_Native:
6566    return CC_AArch64_Arm64EC_Thunk_Native;
6567  }
6568}
6569
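/// Selects the correct CCAssignFn for a given CallingConvention value when
/// assigning locations to return values.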
6570CCAssignFn *
6571AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
6572  switch (CC) {
6573  default:
6574    return RetCC_AArch64_AAPCS;
6575  case CallingConv::ARM64EC_Thunk_X64:
6576    return RetCC_AArch64_Arm64EC_Thunk;
6577  case CallingConv::CFGuard_Check:
6578    if (Subtarget->isWindowsArm64EC())
6579      return RetCC_AArch64_Arm64EC_CFGuard_Check;
6580    return RetCC_AArch64_AAPCS;
6581  }
6582}
6584
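/// Allocates an SME lazy-save buffer of SVL.B * SVL.B bytes (the worst case)
/// together with a 16-byte TPIDR2 block that records the buffer address, and
/// returns the frame index of the TPIDR2 block.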
6585unsigned
6586AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
6587                                              SelectionDAG &DAG) const {
6588  MachineFunction &MF = DAG.getMachineFunction();
6589  MachineFrameInfo &MFI = MF.getFrameInfo();
6590
6591  // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
6592  SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6593                          DAG.getConstant(1, DL, MVT::i32));
6594  SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
6595  SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)};
6596  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
6597  SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
6598  Chain = Buffer.getValue(1);
6599  MFI.CreateVariableSizedObject(Align(1), nullptr);
6600
6601  // Allocate an additional TPIDR2 object on the stack (16 bytes)
6602  unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false);
6603
6604  // Store the buffer pointer to the TPIDR2 stack object.
6605  MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
6606  SDValue Ptr = DAG.getFrameIndex(
6607      TPIDR2Obj,
6608      DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
6609  Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI);
6610
6611  // Set the reserved bytes (10-15) to zero
6612  EVT PtrTy = Ptr.getValueType();
6613  SDValue ReservedPtr =
6614      DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(10, DL, PtrTy));
6615  Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), ReservedPtr,
6616                       MPI);
6617  ReservedPtr =
6618      DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(12, DL, PtrTy));
6619  Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), ReservedPtr,
6620                       MPI);
6621
6622  return TPIDR2Obj;
6623}
6624
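/// Lowers the incoming (formal) arguments described by Ins, appending the
/// lowered values to InVals.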
6625SDValue AArch64TargetLowering::LowerFormalArguments(
6626    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
6627    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
6628    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
6629  MachineFunction &MF = DAG.getMachineFunction();
6630  const Function &F = MF.getFunction();
6631  MachineFrameInfo &MFI = MF.getFrameInfo();
6632  bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv());
6633  bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
6634                    (isVarArg && Subtarget->isWindowsArm64EC());
6635  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
6636
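  // Returning a scalable vector means the function effectively uses the SVE
  // calling convention, which preserves more registers (see isSVECC()).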
6637  SmallVector<ISD::OutputArg, 4> Outs;
6638  GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
6639                DAG.getTargetLoweringInfo(), MF.getDataLayout());
  if (any_of(Outs,
             [](ISD::OutputArg &Out) { return Out.VT.isScalableVector(); }))
6641    FuncInfo->setIsSVECC(true);
6642
6643  // Assign locations to all of the incoming arguments.
6644  SmallVector<CCValAssign, 16> ArgLocs;
6645  DenseMap<unsigned, SDValue> CopiedRegs;
6646  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
6647
6648  // At this point, Ins[].VT may already be promoted to i32. To correctly
6649  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
6650  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
6651  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
6652  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
6653  // LocVT.
6654  unsigned NumArgs = Ins.size();
6655  Function::const_arg_iterator CurOrigArg = F.arg_begin();
6656  unsigned CurArgIdx = 0;
6657  for (unsigned i = 0; i != NumArgs; ++i) {
6658    MVT ValVT = Ins[i].VT;
6659    if (Ins[i].isOrigArg()) {
6660      std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
6661      CurArgIdx = Ins[i].getOrigArgIndex();
6662
6663      // Get type of the original argument.
6664      EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
6665                                  /*AllowUnknown*/ true);
6666      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
6667      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
6668      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
6669        ValVT = MVT::i8;
6670      else if (ActualMVT == MVT::i16)
6671        ValVT = MVT::i16;
6672    }
6673    bool UseVarArgCC = false;
6674    if (IsWin64)
6675      UseVarArgCC = isVarArg;
6676    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
6677    bool Res =
6678        AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
6679    assert(!Res && "Call operand has unhandled type");
6680    (void)Res;
6681  }
6682
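  // A locally streaming function has a non-streaming interface but a streaming
  // body, so an SMSTART must be emitted after the argument copies below.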
6683  SMEAttrs Attrs(MF.getFunction());
6684  bool IsLocallyStreaming =
6685      !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
6686  assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
6687  SDValue Glue = Chain.getValue(1);
6688
6689  SmallVector<SDValue, 16> ArgValues;
6690  unsigned ExtraArgLocs = 0;
6691  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
6692    CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
6693
6694    if (Ins[i].Flags.isByVal()) {
6695      // Byval is used for HFAs in the PCS, but the system should work in a
6696      // non-compliant manner for larger structs.
6697      EVT PtrVT = getPointerTy(DAG.getDataLayout());
6698      int Size = Ins[i].Flags.getByValSize();
6699      unsigned NumRegs = (Size + 7) / 8;
6700
      // FIXME: This works on big-endian for composite byvals, which are the
      // common case. It should also work for fundamental types.
6703      unsigned FrameIdx =
6704        MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
6705      SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
6706      InVals.push_back(FrameIdxN);
6707
6708      continue;
6709    }
6710
6711    if (Ins[i].Flags.isSwiftAsync())
6712      MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
6713
6714    SDValue ArgValue;
6715    if (VA.isRegLoc()) {
6716      // Arguments stored in registers.
6717      EVT RegVT = VA.getLocVT();
6718      const TargetRegisterClass *RC;
6719
6720      if (RegVT == MVT::i32)
6721        RC = &AArch64::GPR32RegClass;
6722      else if (RegVT == MVT::i64)
6723        RC = &AArch64::GPR64RegClass;
6724      else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
6725        RC = &AArch64::FPR16RegClass;
6726      else if (RegVT == MVT::f32)
6727        RC = &AArch64::FPR32RegClass;
6728      else if (RegVT == MVT::f64 || RegVT.is64BitVector())
6729        RC = &AArch64::FPR64RegClass;
6730      else if (RegVT == MVT::f128 || RegVT.is128BitVector())
6731        RC = &AArch64::FPR128RegClass;
6732      else if (RegVT.isScalableVector() &&
6733               RegVT.getVectorElementType() == MVT::i1) {
6734        FuncInfo->setIsSVECC(true);
6735        RC = &AArch64::PPRRegClass;
6736      } else if (RegVT == MVT::aarch64svcount) {
6737        FuncInfo->setIsSVECC(true);
6738        RC = &AArch64::PPRRegClass;
6739      } else if (RegVT.isScalableVector()) {
6740        FuncInfo->setIsSVECC(true);
6741        RC = &AArch64::ZPRRegClass;
6742      } else
6743        llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
6744
6745      // Transform the arguments in physical registers into virtual ones.
6746      Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
6747
6748      if (IsLocallyStreaming) {
6749        // LocallyStreamingFunctions must insert the SMSTART in the correct
6750        // position, so we use Glue to ensure no instructions can be scheduled
6751        // between the chain of:
6752        //        t0: ch,glue = EntryNode
6753        //      t1:  res,ch,glue = CopyFromReg
6754        //     ...
6755        //   tn: res,ch,glue = CopyFromReg t(n-1), ..
6756        // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
6757        // ^^^^^^
6758        // This will be the new Chain/Root node.
6759        ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
6760        Glue = ArgValue.getValue(2);
6761      } else
6762        ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
6763
6764      // If this is an 8, 16 or 32-bit value, it is really passed promoted
6765      // to 64 bits.  Insert an assert[sz]ext to capture this, then
6766      // truncate to the right size.
6767      switch (VA.getLocInfo()) {
6768      default:
6769        llvm_unreachable("Unknown loc info!");
6770      case CCValAssign::Full:
6771        break;
6772      case CCValAssign::Indirect:
6773        assert(
6774            (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
6775            "Indirect arguments should be scalable on most subtargets");
6776        break;
6777      case CCValAssign::BCvt:
6778        ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
6779        break;
6780      case CCValAssign::AExt:
6781      case CCValAssign::SExt:
6782      case CCValAssign::ZExt:
6783        break;
6784      case CCValAssign::AExtUpper:
6785        ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
6786                               DAG.getConstant(32, DL, RegVT));
6787        ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
6788        break;
6789      }
6790    } else { // VA.isRegLoc()
6791      assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
6792      unsigned ArgOffset = VA.getLocMemOffset();
6793      unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
6794                              ? VA.getLocVT().getSizeInBits()
6795                              : VA.getValVT().getSizeInBits()) / 8;
6796
6797      uint32_t BEAlign = 0;
6798      if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
6799          !Ins[i].Flags.isInConsecutiveRegs())
6800        BEAlign = 8 - ArgSize;
6801
6802      SDValue FIN;
6803      MachinePointerInfo PtrInfo;
6804      if (StackViaX4) {
6805        // In both the ARM64EC varargs convention and the thunk convention,
6806        // arguments on the stack are accessed relative to x4, not sp. In
6807        // the thunk convention, there's an additional offset of 32 bytes
6808        // to account for the shadow store.
6809        unsigned ObjOffset = ArgOffset + BEAlign;
6810        if (CallConv == CallingConv::ARM64EC_Thunk_X64)
6811          ObjOffset += 32;
6812        Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
6813        SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
6814        FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
6815                          DAG.getConstant(ObjOffset, DL, MVT::i64));
6816        PtrInfo = MachinePointerInfo::getUnknownStack(MF);
6817      } else {
6818        int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
6819
6820        // Create load nodes to retrieve arguments from the stack.
6821        FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
6822        PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
6823      }
6824
      // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
6826      ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
6827      MVT MemVT = VA.getValVT();
6828
6829      switch (VA.getLocInfo()) {
6830      default:
6831        break;
6832      case CCValAssign::Trunc:
6833      case CCValAssign::BCvt:
6834        MemVT = VA.getLocVT();
6835        break;
6836      case CCValAssign::Indirect:
6837        assert((VA.getValVT().isScalableVector() ||
6838                Subtarget->isWindowsArm64EC()) &&
6839               "Indirect arguments should be scalable on most subtargets");
6840        MemVT = VA.getLocVT();
6841        break;
6842      case CCValAssign::SExt:
6843        ExtType = ISD::SEXTLOAD;
6844        break;
6845      case CCValAssign::ZExt:
6846        ExtType = ISD::ZEXTLOAD;
6847        break;
6848      case CCValAssign::AExt:
6849        ExtType = ISD::EXTLOAD;
6850        break;
6851      }
6852
6853      ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
6854                                MemVT);
6855    }
6856
6857    if (VA.getLocInfo() == CCValAssign::Indirect) {
6858      assert((VA.getValVT().isScalableVT() ||
6859              Subtarget->isWindowsArm64EC()) &&
6860             "Indirect arguments should be scalable on most subtargets");
6861
6862      uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
6863      unsigned NumParts = 1;
6864      if (Ins[i].Flags.isInConsecutiveRegs()) {
6865        assert(!Ins[i].Flags.isInConsecutiveRegsLast());
6866        while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
6867          ++NumParts;
6868      }
6869
6870      MVT PartLoad = VA.getValVT();
6871      SDValue Ptr = ArgValue;
6872
6873      // Ensure we generate all loads for each tuple part, whilst updating the
6874      // pointer after each load correctly using vscale.
6875      while (NumParts > 0) {
6876        ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
6877        InVals.push_back(ArgValue);
6878        NumParts--;
6879        if (NumParts > 0) {
6880          SDValue BytesIncrement;
6881          if (PartLoad.isScalableVector()) {
6882            BytesIncrement = DAG.getVScale(
6883                DL, Ptr.getValueType(),
6884                APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
6885          } else {
6886            BytesIncrement = DAG.getConstant(
6887                APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
6888                Ptr.getValueType());
6889          }
6890          SDNodeFlags Flags;
6891          Flags.setNoUnsignedWrap(true);
6892          Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6893                            BytesIncrement, Flags);
6894          ExtraArgLocs++;
6895          i++;
6896        }
6897      }
6898    } else {
6899      if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
6900        ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
6901                               ArgValue, DAG.getValueType(MVT::i32));
6902
6903      // i1 arguments are zero-extended to i8 by the caller. Emit a
6904      // hint to reflect this.
6905      if (Ins[i].isOrigArg()) {
6906        Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
6907        if (OrigArg->getType()->isIntegerTy(1)) {
6908          if (!Ins[i].Flags.isZExt()) {
6909            ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
6910                                   ArgValue.getValueType(), ArgValue);
6911          }
6912        }
6913      }
6914
6915      InVals.push_back(ArgValue);
6916    }
6917  }
6918  assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
6919
6920  // Insert the SMSTART if this is a locally streaming function and
6921  // make sure it is Glued to the last CopyFromReg value.
6922  if (IsLocallyStreaming) {
6923    SDValue PStateSM;
6924    if (Attrs.hasStreamingCompatibleInterface()) {
6925      PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
6926      Register Reg = MF.getRegInfo().createVirtualRegister(
6927          getRegClassFor(PStateSM.getValueType().getSimpleVT()));
6928      FuncInfo->setPStateSMReg(Reg);
6929      Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
6930    } else {
6931      PStateSM = DAG.getConstant(0, DL, MVT::i64);
6932    }
6933    Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue, PStateSM,
6934                                /*Entry*/ true);
6935
    // Ensure that the SMSTART happens after the CopyFromReg of the arguments
    // and that its chain result is used.
6938    for (unsigned I=0; I<InVals.size(); ++I) {
6939      Register Reg = MF.getRegInfo().createVirtualRegister(
6940          getRegClassFor(InVals[I].getValueType().getSimpleVT()));
6941      Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
6942      InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
6943                                     InVals[I].getValueType());
6944    }
6945  }
6946
6947  // varargs
6948  if (isVarArg) {
6949    if (!Subtarget->isTargetDarwin() || IsWin64) {
6950      // The AAPCS variadic function ABI is identical to the non-variadic
6951      // one. As a result there may be more arguments in registers and we should
6952      // save them for future reference.
6953      // Win64 variadic functions also pass arguments in registers, but all float
6954      // arguments are passed in integer registers.
6955      saveVarArgRegisters(CCInfo, DAG, DL, Chain);
6956    }
6957
6958    // This will point to the next argument passed via stack.
6959    unsigned VarArgsOffset = CCInfo.getStackSize();
6960    // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
6961    VarArgsOffset = alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
6962    FuncInfo->setVarArgsStackOffset(VarArgsOffset);
6963    FuncInfo->setVarArgsStackIndex(
6964        MFI.CreateFixedObject(4, VarArgsOffset, true));
6965
6966    if (MFI.hasMustTailInVarArgFunc()) {
6967      SmallVector<MVT, 2> RegParmTypes;
6968      RegParmTypes.push_back(MVT::i64);
6969      RegParmTypes.push_back(MVT::f128);
6970      // Compute the set of forwarded registers. The rest are scratch.
6971      SmallVectorImpl<ForwardedRegister> &Forwards =
6972                                       FuncInfo->getForwardedMustTailRegParms();
6973      CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
6974                                               CC_AArch64_AAPCS);
6975
6976      // Conservatively forward X8, since it might be used for aggregate return.
6977      if (!CCInfo.isAllocated(AArch64::X8)) {
6978        Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
6979        Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
6980      }
6981    }
6982  }
6983
6984  // On Windows, InReg pointers must be returned, so record the pointer in a
6985  // virtual register at the start of the function so it can be returned in the
6986  // epilogue.
6987  if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
6988    for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
6989      if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
6990           Ins[I].Flags.isInReg()) &&
6991          Ins[I].Flags.isSRet()) {
6992        assert(!FuncInfo->getSRetReturnReg());
6993
6994        MVT PtrTy = getPointerTy(DAG.getDataLayout());
6995        Register Reg =
6996            MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
6997        FuncInfo->setSRetReturnReg(Reg);
6998
6999        SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
7000        Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
7001        break;
7002      }
7003    }
7004  }
7005
7006  unsigned StackArgSize = CCInfo.getStackSize();
7007  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7008  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
7009    // This is a non-standard ABI so by fiat I say we're allowed to make full
7010    // use of the stack area to be popped, which must be aligned to 16 bytes in
7011    // any case:
7012    StackArgSize = alignTo(StackArgSize, 16);
7013
7014    // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
7015    // a multiple of 16.
7016    FuncInfo->setArgumentStackToRestore(StackArgSize);
7017
7018    // This realignment carries over to the available bytes below. Our own
7019    // callers will guarantee the space is free by giving an aligned value to
7020    // CALLSEQ_START.
7021  }
7022  // Even if we're not expected to free up the space, it's useful to know how
7023  // much is there while considering tail calls (because we can reuse it).
7024  FuncInfo->setBytesInStackArgArea(StackArgSize);
7025
7026  if (Subtarget->hasCustomCallingConv())
7027    Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
7028
7029  // Conservatively assume the function requires the lazy-save mechanism.
7030  if (SMEAttrs(MF.getFunction()).hasZAState()) {
7031    unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);
7032    FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
7033  }
7034
7035  return Chain;
7036}
7037
7038void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
7039                                                SelectionDAG &DAG,
7040                                                const SDLoc &DL,
7041                                                SDValue &Chain) const {
7042  MachineFunction &MF = DAG.getMachineFunction();
7043  MachineFrameInfo &MFI = MF.getFrameInfo();
7044  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7045  auto PtrVT = getPointerTy(DAG.getDataLayout());
7046  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
7047
7048  SmallVector<SDValue, 8> MemOps;
7049
7050  auto GPRArgRegs = AArch64::getGPRArgRegs();
7051  unsigned NumGPRArgRegs = GPRArgRegs.size();
7052  if (Subtarget->isWindowsArm64EC()) {
7053    // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
7054    // functions.
7055    NumGPRArgRegs = 4;
7056  }
7057  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
7058
7059  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
7060  int GPRIdx = 0;
7061  if (GPRSaveSize != 0) {
7062    if (IsWin64) {
7063      GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
7064      if (GPRSaveSize & 15)
7065        // The extra size here, if triggered, will always be 8.
        MFI.CreateFixedObject(16 - (GPRSaveSize & 15),
                              -(int)alignTo(GPRSaveSize, 16), false);
7067    } else
7068      GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
7069
7070    SDValue FIN;
7071    if (Subtarget->isWindowsArm64EC()) {
7072      // With the Arm64EC ABI, we reserve the save area as usual, but we
7073      // compute its address relative to x4.  For a normal AArch64->AArch64
7074      // call, x4 == sp on entry, but calls from an entry thunk can pass in a
7075      // different address.
7076      Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7077      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7078      FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
7079                        DAG.getConstant(GPRSaveSize, DL, MVT::i64));
7080    } else {
7081      FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
7082    }
7083
7084    for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
7085      Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
7086      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7087      SDValue Store =
7088          DAG.getStore(Val.getValue(1), DL, Val, FIN,
7089                       IsWin64 ? MachinePointerInfo::getFixedStack(
7090                                     MF, GPRIdx, (i - FirstVariadicGPR) * 8)
7091                               : MachinePointerInfo::getStack(MF, i * 8));
7092      MemOps.push_back(Store);
7093      FIN =
7094          DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
7095    }
7096  }
7097  FuncInfo->setVarArgsGPRIndex(GPRIdx);
7098  FuncInfo->setVarArgsGPRSize(GPRSaveSize);
7099
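  // Save the variadic FPR argument registers. This is skipped on Win64, where
  // floating-point varargs are passed in integer registers instead.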
7100  if (Subtarget->hasFPARMv8() && !IsWin64) {
7101    auto FPRArgRegs = AArch64::getFPRArgRegs();
7102    const unsigned NumFPRArgRegs = FPRArgRegs.size();
7103    unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
7104
7105    unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
7106    int FPRIdx = 0;
7107    if (FPRSaveSize != 0) {
7108      FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
7109
7110      SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
7111
7112      for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
7113        Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
7114        SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
7115
7116        SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
7117                                     MachinePointerInfo::getStack(MF, i * 16));
7118        MemOps.push_back(Store);
7119        FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
7120                          DAG.getConstant(16, DL, PtrVT));
7121      }
7122    }
7123    FuncInfo->setVarArgsFPRIndex(FPRIdx);
7124    FuncInfo->setVarArgsFPRSize(FPRSaveSize);
7125  }
7126
7127  if (!MemOps.empty()) {
7128    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
7129  }
7130}
7131
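// For the purposes of the streaming-mode handling below, a value counts as
// passed in an FPR if it is a fixed-length vector or a non-scalable
// floating-point type.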
7132static bool isPassedInFPR(EVT VT) {
7133  return VT.isFixedLengthVector() ||
7134         (VT.isFloatingPoint() && !VT.isScalableVector());
7135}
7136
/// LowerCallResult - Lower the result values of a call into the appropriate
/// copies out of physical registers.
7139SDValue AArch64TargetLowering::LowerCallResult(
7140    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
7141    const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
7142    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
7143    SDValue ThisVal, bool RequiresSMChange) const {
7144  DenseMap<unsigned, SDValue> CopiedRegs;
7145  // Copy all of the result registers out of their specified physreg.
7146  for (unsigned i = 0; i != RVLocs.size(); ++i) {
7147    CCValAssign VA = RVLocs[i];
7148
7149    // Pass 'this' value directly from the argument to return value, to avoid
7150    // reg unit interference
7151    if (i == 0 && isThisReturn) {
7152      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
7153             "unexpected return calling convention register assignment");
7154      InVals.push_back(ThisVal);
7155      continue;
7156    }
7157
7158    // Avoid copying a physreg twice since RegAllocFast is incompetent and only
7159    // allows one use of a physreg per block.
7160    SDValue Val = CopiedRegs.lookup(VA.getLocReg());
7161    if (!Val) {
7162      Val =
7163          DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
7164      Chain = Val.getValue(1);
7165      InGlue = Val.getValue(2);
7166      CopiedRegs[VA.getLocReg()] = Val;
7167    }
7168
7169    switch (VA.getLocInfo()) {
7170    default:
7171      llvm_unreachable("Unknown loc info!");
7172    case CCValAssign::Full:
7173      break;
7174    case CCValAssign::BCvt:
7175      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
7176      break;
7177    case CCValAssign::AExtUpper:
7178      Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
7179                        DAG.getConstant(32, DL, VA.getLocVT()));
7180      [[fallthrough]];
7181    case CCValAssign::AExt:
7182      [[fallthrough]];
7183    case CCValAssign::ZExt:
7184      Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
7185      break;
7186    }
7187
7188    if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
7189      Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, Val.getValueType(),
7190                        Val);
7191
7192    InVals.push_back(Val);
7193  }
7194
7195  return Chain;
7196}
7197
7198/// Return true if the calling convention is one that we can guarantee TCO for.
7199static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
7200  return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
7201         CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
7202}
7203
7204/// Return true if we might ever do TCO for calls with this calling convention.
7205static bool mayTailCallThisCC(CallingConv::ID CC) {
7206  switch (CC) {
7207  case CallingConv::C:
7208  case CallingConv::AArch64_SVE_VectorCall:
7209  case CallingConv::PreserveMost:
7210  case CallingConv::PreserveAll:
7211  case CallingConv::Swift:
7212  case CallingConv::SwiftTail:
7213  case CallingConv::Tail:
7214  case CallingConv::Fast:
7215    return true;
7216  default:
7217    return false;
7218  }
7219}
7220
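// Assigns a location to each outgoing call operand using the CCAssignFn
// selected for the callee's calling convention, mirroring the handling of
// formal arguments above.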
7221static void analyzeCallOperands(const AArch64TargetLowering &TLI,
7222                                const AArch64Subtarget *Subtarget,
7223                                const TargetLowering::CallLoweringInfo &CLI,
7224                                CCState &CCInfo) {
7225  const SelectionDAG &DAG = CLI.DAG;
7226  CallingConv::ID CalleeCC = CLI.CallConv;
7227  bool IsVarArg = CLI.IsVarArg;
7228  const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7229  bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
7230
7231  // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
7232  // for the shadow store.
7233  if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
7234    CCInfo.AllocateStack(32, Align(16));
7235
7236  unsigned NumArgs = Outs.size();
7237  for (unsigned i = 0; i != NumArgs; ++i) {
7238    MVT ArgVT = Outs[i].VT;
7239    ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
7240
7241    bool UseVarArgCC = false;
7242    if (IsVarArg) {
7243      // On Windows, the fixed arguments in a vararg call are passed in GPRs
7244      // too, so use the vararg CC to force them to integer registers.
7245      if (IsCalleeWin64) {
7246        UseVarArgCC = true;
7247      } else {
7248        UseVarArgCC = !Outs[i].IsFixed;
7249      }
7250    }
7251
7252    if (!UseVarArgCC) {
7253      // Get type of the original argument.
7254      EVT ActualVT =
          TLI.getValueType(DAG.getDataLayout(),
                           CLI.Args[Outs[i].OrigArgIndex].Ty,
                           /*AllowUnknown*/ true);
7257      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
7258      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7259      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7260        ArgVT = MVT::i8;
7261      else if (ActualMVT == MVT::i16)
7262        ArgVT = MVT::i16;
7263    }
7264
7265    CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
7266    bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
7267    assert(!Res && "Call operand has unhandled type");
7268    (void)Res;
7269  }
7270}
7271
7272bool AArch64TargetLowering::isEligibleForTailCallOptimization(
7273    const CallLoweringInfo &CLI) const {
7274  CallingConv::ID CalleeCC = CLI.CallConv;
7275  if (!mayTailCallThisCC(CalleeCC))
7276    return false;
7277
7278  SDValue Callee = CLI.Callee;
7279  bool IsVarArg = CLI.IsVarArg;
7280  const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7281  const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7282  const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7283  const SelectionDAG &DAG = CLI.DAG;
7284  MachineFunction &MF = DAG.getMachineFunction();
7285  const Function &CallerF = MF.getFunction();
7286  CallingConv::ID CallerCC = CallerF.getCallingConv();
7287
7288  // SME Streaming functions are not eligible for TCO as they may require
7289  // the streaming mode or ZA to be restored after returning from the call.
7290  SMEAttrs CallerAttrs(MF.getFunction());
7291  auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
7292  if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
7293      CallerAttrs.requiresLazySave(CalleeAttrs) ||
7294      CallerAttrs.hasStreamingBody())
7295    return false;
7296
7297  // Functions using the C or Fast calling convention that have an SVE signature
7298  // preserve more registers and should assume the SVE_VectorCall CC.
7299  // The check for matching callee-saved regs will determine whether it is
7300  // eligible for TCO.
7301  if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
7302      MF.getInfo<AArch64FunctionInfo>()->isSVECC())
7303    CallerCC = CallingConv::AArch64_SVE_VectorCall;
7304
7305  bool CCMatch = CallerCC == CalleeCC;
7306
7307  // When using the Windows calling convention on a non-windows OS, we want
7308  // to back up and restore X18 in such functions; we can't do a tail call
7309  // from those functions.
7310  if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
7311      CalleeCC != CallingConv::Win64)
7312    return false;
7313
7314  // Byval parameters hand the function a pointer directly into the stack area
7315  // we want to reuse during a tail call. Working around this *is* possible (see
7316  // X86) but less efficient and uglier in LowerCall.
7317  for (Function::const_arg_iterator i = CallerF.arg_begin(),
7318                                    e = CallerF.arg_end();
7319       i != e; ++i) {
7320    if (i->hasByValAttr())
7321      return false;
7322
7323    // On Windows, "inreg" attributes signify non-aggregate indirect returns.
7324    // In this case, it is necessary to save/restore X0 in the callee. Tail
7325    // call opt interferes with this. So we disable tail call opt when the
7326    // caller has an argument with "inreg" attribute.
7327
7328    // FIXME: Check whether the callee also has an "inreg" argument.
7329    if (i->hasInRegAttr())
7330      return false;
7331  }
7332
7333  if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
7334    return CCMatch;
7335
7336  // Externally-defined functions with weak linkage should not be
7337  // tail-called on AArch64 when the OS does not support dynamic
7338  // pre-emption of symbols, as the AAELF spec requires normal calls
7339  // to undefined weak functions to be replaced with a NOP or jump to the
7340  // next instruction. The behaviour of branch instructions in this
7341  // situation (as used for tail calls) is implementation-defined, so we
7342  // cannot rely on the linker replacing the tail call with a return.
7343  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
7344    const GlobalValue *GV = G->getGlobal();
7345    const Triple &TT = getTargetMachine().getTargetTriple();
7346    if (GV->hasExternalWeakLinkage() &&
7347        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
7348      return false;
7349  }
7350
7351  // Now we search for cases where we can use a tail call without changing the
7352  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
7353  // concept.
7354
7355  // I want anyone implementing a new calling convention to think long and hard
7356  // about this assert.
7357  assert((!IsVarArg || CalleeCC == CallingConv::C) &&
7358         "Unexpected variadic calling convention");
7359
7360  LLVMContext &C = *DAG.getContext();
7361  // Check that the call results are passed in the same way.
7362  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
7363                                  CCAssignFnForCall(CalleeCC, IsVarArg),
7364                                  CCAssignFnForCall(CallerCC, IsVarArg)))
7365    return false;
7366  // The callee has to preserve all registers the caller needs to preserve.
7367  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7368  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
7369  if (!CCMatch) {
7370    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
7371    if (Subtarget->hasCustomCallingConv()) {
7372      TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
7373      TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
7374    }
7375    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
7376      return false;
7377  }
7378
7379  // Nothing more to check if the callee is taking no arguments
7380  if (Outs.empty())
7381    return true;
7382
7383  SmallVector<CCValAssign, 16> ArgLocs;
7384  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
7385
7386  analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
7387
7388  if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
    // When we are musttail, additional checks have been done and we can safely
    // ignore this check.
7390    // At least two cases here: if caller is fastcc then we can't have any
7391    // memory arguments (we'd be expected to clean up the stack afterwards). If
7392    // caller is C then we could potentially use its argument area.
7393
7394    // FIXME: for now we take the most conservative of these in both cases:
7395    // disallow all variadic memory operands.
7396    for (const CCValAssign &ArgLoc : ArgLocs)
7397      if (!ArgLoc.isRegLoc())
7398        return false;
7399  }
7400
7401  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7402
  // If any of the arguments is passed indirectly, it must be SVE, so the
  // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
  // allocate space on the stack. That is why we determine this explicitly
  // here: if any argument is passed indirectly, the call cannot be a tail
  // call.
7407  if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
7408        assert((A.getLocInfo() != CCValAssign::Indirect ||
7409                A.getValVT().isScalableVector() ||
7410                Subtarget->isWindowsArm64EC()) &&
7411               "Expected value to be scalable");
7412        return A.getLocInfo() == CCValAssign::Indirect;
7413      }))
7414    return false;
7415
7416  // If the stack arguments for this call do not fit into our own save area then
7417  // the call cannot be made tail.
7418  if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
7419    return false;
7420
7421  const MachineRegisterInfo &MRI = MF.getRegInfo();
7422  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
7423    return false;
7424
7425  return true;
7426}
7427
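/// Builds a TokenFactor that orders any previously emitted load from the stack
/// slot ClobberedFI (or a slot overlapping it) before the store that will
/// overwrite it with an outgoing tail-call argument.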
7428SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
7429                                                   SelectionDAG &DAG,
7430                                                   MachineFrameInfo &MFI,
7431                                                   int ClobberedFI) const {
7432  SmallVector<SDValue, 8> ArgChains;
7433  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
7434  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
7435
7436  // Include the original chain at the beginning of the list. When this is
7437  // used by target LowerCall hooks, this helps legalize find the
7438  // CALLSEQ_BEGIN node.
7439  ArgChains.push_back(Chain);
7440
  // Add a chain value for each previously emitted load from a stack slot that
  // overlaps the clobbered slot.
7442  for (SDNode *U : DAG.getEntryNode().getNode()->uses())
7443    if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
7444      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
7445        if (FI->getIndex() < 0) {
7446          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
7447          int64_t InLastByte = InFirstByte;
7448          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
7449
7450          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
7451              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
7452            ArgChains.push_back(SDValue(L, 1));
7453        }
7454
7455  // Build a tokenfactor for all the chains.
7456  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
7457}
7458
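/// Returns true if the callee is expected to pop its own stack arguments, i.e.
/// for fastcc under GuaranteedTailCallOpt and for the tail and swifttail
/// calling conventions.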
7459bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
7460                                                   bool TailCallOpt) const {
7461  return (CallCC == CallingConv::Fast && TailCallOpt) ||
7462         CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
7463}
7464
7465// Check if the value is zero-extended from i1 to i8
7466static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
7467  unsigned SizeInBits = Arg.getValueType().getSizeInBits();
7468  if (SizeInBits < 8)
7469    return false;
7470
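  // For a bool zero-extended to i8, bits 1-7 of the value must be known zero
  // (hence the 0xFE mask below).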
  APInt RequiredZero(SizeInBits, 0xFE);
  KnownBits Bits = DAG.computeKnownBits(Arg, 4);
  bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
7474  return ZExtBool;
7475}
7476
7477void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
7478                                                          SDNode *Node) const {
7479  // Live-in physreg copies that are glued to SMSTART are applied as
7480  // implicit-def's in the InstrEmitter. Here we remove them, allowing the
7481  // register allocator to pass call args in callee saved regs, without extra
7482  // copies to avoid these fake clobbers of actually-preserved GPRs.
7483  if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
7484      MI.getOpcode() == AArch64::MSRpstatePseudo)
7485    for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
7486      if (MachineOperand &MO = MI.getOperand(I);
7487          MO.isReg() && MO.isImplicit() && MO.isDef() &&
7488          (AArch64::GPR32RegClass.contains(MO.getReg()) ||
7489           AArch64::GPR64RegClass.contains(MO.getReg())))
7490        MI.removeOperand(I);
7491}
7492
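/// Emits an SMSTART (Enable == true) or SMSTOP node to change the streaming
/// mode (PSTATE.SM). The result carries chain and glue values so surrounding
/// nodes can be ordered against the mode change.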
7493SDValue AArch64TargetLowering::changeStreamingMode(
7494    SelectionDAG &DAG, SDLoc DL, bool Enable,
7495    SDValue Chain, SDValue InGlue, SDValue PStateSM, bool Entry) const {
7496  MachineFunction &MF = DAG.getMachineFunction();
7497  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7498  FuncInfo->setHasStreamingModeChanges(true);
7499
7500  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7501  SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
7502  SDValue MSROp =
7503      DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
7504
7505  SDValue ExpectedSMVal =
7506      DAG.getTargetConstant(Entry ? Enable : !Enable, DL, MVT::i64);
7507  SmallVector<SDValue> Ops = {Chain, MSROp, PStateSM, ExpectedSMVal, RegMask};
7508
7509  if (InGlue)
7510    Ops.push_back(InGlue);
7511
7512  unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
7513  return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
7514}
7515
7516/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
7517/// and add input and output parameter nodes.
7518SDValue
7519AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
7520                                 SmallVectorImpl<SDValue> &InVals) const {
7521  SelectionDAG &DAG = CLI.DAG;
7522  SDLoc &DL = CLI.DL;
7523  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7524  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7525  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7526  SDValue Chain = CLI.Chain;
7527  SDValue Callee = CLI.Callee;
7528  bool &IsTailCall = CLI.IsTailCall;
7529  CallingConv::ID &CallConv = CLI.CallConv;
7530  bool IsVarArg = CLI.IsVarArg;
7531
7532  MachineFunction &MF = DAG.getMachineFunction();
7533  MachineFunction::CallSiteInfo CSInfo;
7534  bool IsThisReturn = false;
7535
7536  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7537  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7538  bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
7539  bool IsSibCall = false;
7540  bool GuardWithBTI = false;
7541
7542  if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
7543      !Subtarget->noBTIAtReturnTwice()) {
7544    GuardWithBTI = FuncInfo->branchTargetEnforcement();
7545  }
7546
7547  // Analyze operands of the call, assigning locations to each operand.
7548  SmallVector<CCValAssign, 16> ArgLocs;
7549  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
7550
7551  if (IsVarArg) {
7552    unsigned NumArgs = Outs.size();
7553
7554    for (unsigned i = 0; i != NumArgs; ++i) {
7555      if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
7556        report_fatal_error("Passing SVE types to variadic functions is "
7557                           "currently not supported");
7558    }
7559  }
7560
7561  analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
7562
7563  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
7564  // Assign locations to each value returned by this call.
7565  SmallVector<CCValAssign, 16> RVLocs;
7566  CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
7567                    *DAG.getContext());
7568  RetCCInfo.AnalyzeCallResult(Ins, RetCC);
7569
7570  // Check callee args/returns for SVE registers and set calling convention
7571  // accordingly.
7572  if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
7573    auto HasSVERegLoc = [](CCValAssign &Loc) {
7574      if (!Loc.isRegLoc())
7575        return false;
7576      return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
7577             AArch64::PPRRegClass.contains(Loc.getLocReg());
7578    };
7579    if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
7580      CallConv = CallingConv::AArch64_SVE_VectorCall;
7581  }
7582
7583  if (IsTailCall) {
7584    // Check if it's really possible to do a tail call.
7585    IsTailCall = isEligibleForTailCallOptimization(CLI);
7586
7587    // A sibling call is one where we're under the usual C ABI and not planning
7588    // to change that but can still do a tail call:
7589    if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
7590        CallConv != CallingConv::SwiftTail)
7591      IsSibCall = true;
7592
7593    if (IsTailCall)
7594      ++NumTailCalls;
7595  }
7596
7597  if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
7598    report_fatal_error("failed to perform tail call elimination on a call "
7599                       "site marked musttail");
7600
7601  // Get a count of how many bytes are to be pushed on the stack.
7602  unsigned NumBytes = CCInfo.getStackSize();
7603
7604  if (IsSibCall) {
7605    // Since we're not changing the ABI to make this a tail call, the memory
7606    // operands are already available in the caller's incoming argument space.
7607    NumBytes = 0;
7608  }
7609
7610  // FPDiff is the byte offset of the call's argument area from the callee's.
7611  // Stores to callee stack arguments will be placed in FixedStackSlots offset
7612  // by this amount for a tail call. In a sibling call it must be 0 because the
7613  // caller will deallocate the entire stack and the callee still expects its
7614  // arguments to begin at SP+0. Completely unused for non-tail calls.
7615  int FPDiff = 0;
7616
7617  if (IsTailCall && !IsSibCall) {
7618    unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
7619
7620    // Since callee will pop argument stack as a tail call, we must keep the
7621    // popped size 16-byte aligned.
7622    NumBytes = alignTo(NumBytes, 16);
7623
7624    // FPDiff will be negative if this tail call requires more space than we
7625    // would automatically have in our incoming argument space. Positive if we
7626    // can actually shrink the stack.
7627    FPDiff = NumReusableBytes - NumBytes;
7628
7629    // Update the required reserved area if this is the tail call requiring the
7630    // most argument stack space.
7631    if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
7632      FuncInfo->setTailCallReservedStack(-FPDiff);
7633
7634    // The stack pointer must be 16-byte aligned at all times it's used for a
7635    // memory operation, which in practice means at *all* times and in
7636    // particular across call boundaries. Therefore our own arguments started at
7637    // a 16-byte aligned SP and the delta applied for the tail call should
7638    // satisfy the same constraint.
7639    assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
7640  }
7641
7642  // Determine whether we need any streaming mode changes.
7643  SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
7644  if (CLI.CB)
7645    CalleeAttrs = SMEAttrs(*CLI.CB);
7646  else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
7647    CalleeAttrs = SMEAttrs(ES->getSymbol());
7648
7649  auto DescribeCallsite =
7650      [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
7651    R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
7652    if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
7653      R << ore::NV("Callee", ES->getSymbol());
7654    else if (CLI.CB && CLI.CB->getCalledFunction())
7655      R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
7656    else
7657      R << "unknown callee";
7658    R << "'";
7659    return R;
7660  };
7661
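  // If this call requires a lazy save of ZA, record the number of ZA save
  // slices in the TPIDR2 block and point TPIDR2_EL0 at it, so that a callee
  // which uses ZA commits the save.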
7662  bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
7663  if (RequiresLazySave) {
7664    unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
7665    MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
7666    SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
7667        DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
7668    SDValue NumZaSaveSlicesAddr =
7669        DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
7670                    DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
7671    SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
7672                                          DAG.getConstant(1, DL, MVT::i32));
7673    Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
7674                              MPI, MVT::i16);
7675    Chain = DAG.getNode(
7676        ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
7677        DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
7678        TPIDR2ObjAddr);
7679    OptimizationRemarkEmitter ORE(&MF.getFunction());
7680    ORE.emit([&]() {
7681      auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
7682                                                   CLI.CB)
7683                      : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
7684                                                   &MF.getFunction());
7685      return DescribeCallsite(R) << " sets up a lazy save for ZA";
7686    });
7687  }
7688
7689  SDValue PStateSM;
7690  bool RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs);
7691  if (RequiresSMChange) {
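    // PSTATE.SM at this point is a compile-time constant when the caller is
    // streaming (streaming interface or locally-streaming body) or has a plain
    // non-streaming interface; for streaming-compatible callers it has to be
    // read at runtime.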
7692    if (CallerAttrs.hasStreamingInterfaceOrBody())
7693      PStateSM = DAG.getConstant(1, DL, MVT::i64);
7694    else if (CallerAttrs.hasNonStreamingInterface())
7695      PStateSM = DAG.getConstant(0, DL, MVT::i64);
7696    else
7697      PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
7698    OptimizationRemarkEmitter ORE(&MF.getFunction());
7699    ORE.emit([&]() {
7700      auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
7701                                                   CLI.CB)
7702                      : OptimizationRemarkAnalysis("sme", "SMETransition",
7703                                                   &MF.getFunction());
7704      DescribeCallsite(R) << " requires a streaming mode transition";
7705      return R;
7706    });
7707  }
7708
7709  SDValue ZTFrameIdx;
7710  MachineFrameInfo &MFI = MF.getFrameInfo();
7711  bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(CalleeAttrs);
7712
7713  // If the caller has ZT0 state which will not be preserved by the callee,
7714  // spill ZT0 before the call.
7715  if (ShouldPreserveZT0) {
7716    unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16));
7717    ZTFrameIdx = DAG.getFrameIndex(
7718        ZTObj,
7719        DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
7720
7721    Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
7722                        {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
7723  }
7724
7725  // If the caller shares ZT0 but the callee does not share ZA, we need to
7726  // stop PSTATE.ZA before the call if there is no lazy-save active.
7727  bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(CalleeAttrs);
7728  assert((!DisableZA || !RequiresLazySave) &&
7729         "Lazy save of ZA requires PSTATE.ZA=1 across the call");
7730
7731  if (DisableZA)
7732    Chain = DAG.getNode(
7733        AArch64ISD::SMSTOP, DL, MVT::Other, Chain,
7734        DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
7735        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
7736
7737  // Adjust the stack pointer for the new arguments...
7738  // These operations are automatically eliminated by the prolog/epilog pass
7739  if (!IsSibCall)
7740    Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
7741
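  // Take a copy of SP now; the addresses of outgoing stack arguments (and the
  // Arm64EC x4 parameter pointer) are formed relative to it.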
7742  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
7743                                        getPointerTy(DAG.getDataLayout()));
7744
7745  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
7746  SmallSet<unsigned, 8> RegsUsed;
7747  SmallVector<SDValue, 8> MemOpChains;
7748  auto PtrVT = getPointerTy(DAG.getDataLayout());
7749
7750  if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
7751    const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
7752    for (const auto &F : Forwards) {
7753      SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
7754      RegsToPass.emplace_back(F.PReg, Val);
7755    }
7756  }
7757
7758  // Walk the register/memloc assignments, inserting copies/loads.
7759  unsigned ExtraArgLocs = 0;
7760  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
7761    CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
7762    SDValue Arg = OutVals[i];
7763    ISD::ArgFlagsTy Flags = Outs[i].Flags;
7764
7765    // Promote the value if needed.
7766    switch (VA.getLocInfo()) {
7767    default:
7768      llvm_unreachable("Unknown loc info!");
7769    case CCValAssign::Full:
7770      break;
7771    case CCValAssign::SExt:
7772      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
7773      break;
7774    case CCValAssign::ZExt:
7775      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
7776      break;
7777    case CCValAssign::AExt:
7778      if (Outs[i].ArgVT == MVT::i1) {
7779        // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
7780        //
7781        // Check if we actually have to do this, because the value may
7782        // already be zero-extended.
7783        //
7784        // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
7785        // and rely on DAGCombiner to fold this, because the following
7786        // (anyext i32) is combined with (zext i8) in DAG.getNode:
7787        //
7788        //   (ext (zext x)) -> (zext x)
7789        //
7790        // This will give us (zext i32), which we cannot remove, so
7791        // try to check this beforehand.
7792        if (!checkZExtBool(Arg, DAG)) {
7793          Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
7794          Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
7795        }
7796      }
7797      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
7798      break;
7799    case CCValAssign::AExtUpper:
7800      assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
7801      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
7802      Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
7803                        DAG.getConstant(32, DL, VA.getLocVT()));
7804      break;
7805    case CCValAssign::BCvt:
7806      Arg = DAG.getBitcast(VA.getLocVT(), Arg);
7807      break;
7808    case CCValAssign::Trunc:
7809      Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
7810      break;
7811    case CCValAssign::FPExt:
7812      Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
7813      break;
7814    case CCValAssign::Indirect:
7815      bool isScalable = VA.getValVT().isScalableVT();
7816      assert((isScalable || Subtarget->isWindowsArm64EC()) &&
7817             "Indirect arguments should be scalable on most subtargets");
7818
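      // Pass the argument indirectly: allocate a (scalable, if required) stack
      // slot, store the value (or each part of a multi-register tuple) into
      // it, and pass the address of the slot in place of the value itself.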
7819      uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
7820      uint64_t PartSize = StoreSize;
7821      unsigned NumParts = 1;
7822      if (Outs[i].Flags.isInConsecutiveRegs()) {
7823        assert(!Outs[i].Flags.isInConsecutiveRegsLast());
7824        while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
7825          ++NumParts;
7826        StoreSize *= NumParts;
7827      }
7828
7829      Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
7830      Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
7831      MachineFrameInfo &MFI = MF.getFrameInfo();
7832      int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
7833      if (isScalable)
7834        MFI.setStackID(FI, TargetStackID::ScalableVector);
7835
7836      MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
7837      SDValue Ptr = DAG.getFrameIndex(
7838          FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
7839      SDValue SpillSlot = Ptr;
7840
7841      // Ensure we generate all stores for each tuple part, whilst updating the
7842      // pointer after each store correctly using vscale.
7843      while (NumParts) {
7844        SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
7845        MemOpChains.push_back(Store);
7846
7847        NumParts--;
7848        if (NumParts > 0) {
7849          SDValue BytesIncrement;
7850          if (isScalable) {
7851            BytesIncrement = DAG.getVScale(
7852                DL, Ptr.getValueType(),
7853                APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
7854          } else {
7855            BytesIncrement = DAG.getConstant(
7856                APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
7857                Ptr.getValueType());
7858          }
7859          SDNodeFlags Flags;
7860          Flags.setNoUnsignedWrap(true);
7861
7862          MPI = MachinePointerInfo(MPI.getAddrSpace());
7863          Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7864                            BytesIncrement, Flags);
7865          ExtraArgLocs++;
7866          i++;
7867        }
7868      }
7869
7870      Arg = SpillSlot;
7871      break;
7872    }
7873
7874    if (VA.isRegLoc()) {
7875      if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
7876          Outs[0].VT == MVT::i64) {
7877        assert(VA.getLocVT() == MVT::i64 &&
7878               "unexpected calling convention register assignment");
7879        assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
7880               "unexpected use of 'returned'");
7881        IsThisReturn = true;
7882      }
7883      if (RegsUsed.count(VA.getLocReg())) {
7884        // If this register has already been used then we're trying to pack
7885        // parts of an [N x i32] into an X-register. The extension type will
7886        // take care of putting the two halves in the right place but we have to
7887        // combine them.
7888        SDValue &Bits =
7889            llvm::find_if(RegsToPass,
7890                          [=](const std::pair<unsigned, SDValue> &Elt) {
7891                            return Elt.first == VA.getLocReg();
7892                          })
7893                ->second;
7894        Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
7895        // Call site info is used for function's parameter entry value
7896        // tracking. For now we track only simple cases when parameter
7897        // is transferred through whole register.
7898        llvm::erase_if(CSInfo, [&VA](MachineFunction::ArgRegPair ArgReg) {
7899          return ArgReg.Reg == VA.getLocReg();
7900        });
7901      } else {
7902        // Add an extra level of indirection for streaming mode changes by
7903        // using a pseudo copy node that the simple register coalescer cannot
7904        // rematerialise between an smstart/smstop and the call.
7905        if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
7906          Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
7907                            Arg.getValueType(), Arg);
7908        RegsToPass.emplace_back(VA.getLocReg(), Arg);
7909        RegsUsed.insert(VA.getLocReg());
7910        const TargetOptions &Options = DAG.getTarget().Options;
7911        if (Options.EmitCallSiteInfo)
7912          CSInfo.emplace_back(VA.getLocReg(), i);
7913      }
7914    } else {
7915      assert(VA.isMemLoc());
7916
7917      SDValue DstAddr;
7918      MachinePointerInfo DstInfo;
7919
7920      // FIXME: This works on big-endian for composite byvals, which are the
7921      // common case. It should work for fundamental types too.
7922      uint32_t BEAlign = 0;
7923      unsigned OpSize;
7924      if (VA.getLocInfo() == CCValAssign::Indirect ||
7925          VA.getLocInfo() == CCValAssign::Trunc)
7926        OpSize = VA.getLocVT().getFixedSizeInBits();
7927      else
7928        OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
7929                                 : VA.getValVT().getSizeInBits();
7930      OpSize = (OpSize + 7) / 8;
7931      if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
7932          !Flags.isInConsecutiveRegs()) {
7933        if (OpSize < 8)
7934          BEAlign = 8 - OpSize;
7935      }
7936      unsigned LocMemOffset = VA.getLocMemOffset();
7937      int32_t Offset = LocMemOffset + BEAlign;
7938      SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
7939      PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
7940
7941      if (IsTailCall) {
7942        Offset = Offset + FPDiff;
7943        int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
7944
7945        DstAddr = DAG.getFrameIndex(FI, PtrVT);
7946        DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
7947
7948        // Make sure any stack arguments overlapping with where we're storing
7949        // are loaded before this eventual operation. Otherwise they'll be
7950        // clobbered.
7951        Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
7952      } else {
7953        SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
7954
7955        DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
7956        DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
7957      }
7958
7959      if (Outs[i].Flags.isByVal()) {
7960        SDValue SizeNode =
7961            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
7962        SDValue Cpy = DAG.getMemcpy(
7963            Chain, DL, DstAddr, Arg, SizeNode,
7964            Outs[i].Flags.getNonZeroByValAlign(),
7965            /*isVol = */ false, /*AlwaysInline = */ false,
7966            /*isTailCall = */ false, DstInfo, MachinePointerInfo());
7967
7968        MemOpChains.push_back(Cpy);
7969      } else {
7970        // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
7971        // promoted to a legal register type i32, we should truncate Arg back to
7972        // i1/i8/i16.
7973        if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
7974            VA.getValVT() == MVT::i16)
7975          Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
7976
7977        SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
7978        MemOpChains.push_back(Store);
7979      }
7980    }
7981  }
7982
7983  if (IsVarArg && Subtarget->isWindowsArm64EC()) {
7984    SDValue ParamPtr = StackPtr;
7985    if (IsTailCall) {
7986      // Create a dummy object at the top of the stack that can be used to get
7987      // the SP after the epilogue
7988      int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
7989      ParamPtr = DAG.getFrameIndex(FI, PtrVT);
7990    }
7991
7992    // For vararg calls, the Arm64EC ABI requires values in x4 and x5
7993    // describing the argument list.  x4 contains the address of the
7994    // first stack parameter. x5 contains the size in bytes of all parameters
7995    // passed on the stack.
7996    RegsToPass.emplace_back(AArch64::X4, ParamPtr);
7997    RegsToPass.emplace_back(AArch64::X5,
7998                            DAG.getConstant(NumBytes, DL, MVT::i64));
7999  }
8000
8001  if (!MemOpChains.empty())
8002    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
8003
8004  SDValue InGlue;
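  // If the callee requires a different PSTATE.SM value, switch streaming mode
  // now, before the outgoing argument registers are loaded; the
  // COALESCER_BARRIER nodes inserted above stop FPR arguments from being
  // rematerialised across this transition.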
8005  if (RequiresSMChange) {
8006    SDValue NewChain =
8007        changeStreamingMode(DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain,
8008                            InGlue, PStateSM, true);
8009    Chain = NewChain.getValue(0);
8010    InGlue = NewChain.getValue(1);
8011  }
8012
8013  // Build a sequence of copy-to-reg nodes chained together with token chain
8014  // and flag operands which copy the outgoing args into the appropriate regs.
8015  for (auto &RegToPass : RegsToPass) {
8016    Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
8017                             RegToPass.second, InGlue);
8018    InGlue = Chain.getValue(1);
8019  }
8020
8021  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
8022  // direct call is), turn it into a TargetGlobalAddress/TargetExternalSymbol
8023  // node so that legalize doesn't hack it.
8024  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
8025    auto GV = G->getGlobal();
8026    unsigned OpFlags =
8027        Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
8028    if (OpFlags & AArch64II::MO_GOT) {
8029      Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8030      Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8031    } else {
8033      Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8034    }
8035  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
8036    if (getTargetMachine().getCodeModel() == CodeModel::Large &&
8037        Subtarget->isTargetMachO()) {
8038      const char *Sym = S->getSymbol();
8039      Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
8040      Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8041    } else {
8042      const char *Sym = S->getSymbol();
8043      Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
8044    }
8045  }
8046
8047  // We don't usually want to end the call-sequence here because we would tidy
8048  // the frame up *after* the call. However, in the ABI-changing tail-call case
8049  // we've carefully laid out the parameters so that when sp is reset they'll be
8050  // in the correct location.
8051  if (IsTailCall && !IsSibCall) {
8052    Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
8053    InGlue = Chain.getValue(1);
8054  }
8055
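  // Operands of the call node: the chain, the callee, the FPDiff for tail
  // calls, the registers carrying arguments, the register mask describing the
  // call-preserved registers, and any incoming glue.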
8056  std::vector<SDValue> Ops;
8057  Ops.push_back(Chain);
8058  Ops.push_back(Callee);
8059
8060  if (IsTailCall) {
8061    // Each tail call may have to adjust the stack by a different amount, so
8062    // this information must travel along with the operation for eventual
8063    // consumption by emitEpilogue.
8064    Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
8065  }
8066
8067  // Add argument registers to the end of the list so that they are known live
8068  // into the call.
8069  for (auto &RegToPass : RegsToPass)
8070    Ops.push_back(DAG.getRegister(RegToPass.first,
8071                                  RegToPass.second.getValueType()));
8072
8073  // Add a register mask operand representing the call-preserved registers.
8074  const uint32_t *Mask;
8075  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8076  if (IsThisReturn) {
8077    // For 'this' returns, use the X0-preserving mask if applicable
8078    Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
8079    if (!Mask) {
8080      IsThisReturn = false;
8081      Mask = TRI->getCallPreservedMask(MF, CallConv);
8082    }
8083  } else
8084    Mask = TRI->getCallPreservedMask(MF, CallConv);
8085
8086  if (Subtarget->hasCustomCallingConv())
8087    TRI->UpdateCustomCallPreservedMask(MF, &Mask);
8088
8089  if (TRI->isAnyArgRegReserved(MF))
8090    TRI->emitReservedArgRegCallError(MF);
8091
8092  assert(Mask && "Missing call preserved mask for calling convention");
8093  Ops.push_back(DAG.getRegisterMask(Mask));
8094
8095  if (InGlue.getNode())
8096    Ops.push_back(InGlue);
8097
8098  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8099
8100  // If we're doing a tail call, use a TC_RETURN here rather than an
8101  // actual call instruction.
8102  if (IsTailCall) {
8103    MF.getFrameInfo().setHasTailCall();
8104    SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
8105
8106    if (IsCFICall)
8107      Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8108
8109    DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
8110    DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
8111    return Ret;
8112  }
8113
8114  unsigned CallOpc = AArch64ISD::CALL;
8115  // Calls with operand bundle "clang.arc.attachedcall" are special. They should
8116  // be expanded to the call, directly followed by a special marker sequence and
8117  // a call to an ObjC library function.  Use CALL_RVMARKER to do that.
8118  if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
8119    assert(!IsTailCall &&
8120           "tail calls cannot be marked with clang.arc.attachedcall");
8121    CallOpc = AArch64ISD::CALL_RVMARKER;
8122
8123    // Add a target global address for the retainRV/claimRV runtime function
8124    // just before the call target.
8125    Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
8126    auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
8127    Ops.insert(Ops.begin() + 1, GA);
8128  } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8129    CallOpc = AArch64ISD::CALL_ARM64EC_TO_X64;
8130  } else if (GuardWithBTI) {
8131    CallOpc = AArch64ISD::CALL_BTI;
8132  }
8133
8134  // Returns a chain and a flag for retval copy to use.
8135  Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);
8136
8137  if (IsCFICall)
8138    Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8139
8140  DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
8141  InGlue = Chain.getValue(1);
8142  DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
8143
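  // If the calling convention has the callee pop its own stack arguments,
  // report how many bytes it will pop (kept 16-byte aligned) so CALLSEQ_END
  // models the SP adjustment correctly.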
8144  uint64_t CalleePopBytes =
8145      DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
8146
8147  Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
8148  InGlue = Chain.getValue(1);
8149
8150  // Handle result values, copying them out of physregs into vregs that we
8151  // return.
8152  SDValue Result = LowerCallResult(
8153      Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
8154      IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
8155
8156  if (!Ins.empty())
8157    InGlue = Result.getValue(Result->getNumValues() - 1);
8158
8159  if (RequiresSMChange) {
8160    assert(PStateSM && "Expected a PStateSM to be set");
8161    Result = changeStreamingMode(DAG, DL, !CalleeAttrs.hasStreamingInterface(),
8162                                 Result, InGlue, PStateSM, false);
8163  }
8164
8165  if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
8166    // Unconditionally resume ZA.
8167    Result = DAG.getNode(
8168        AArch64ISD::SMSTART, DL, MVT::Other, Result,
8169        DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8170        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
8171
8172  if (ShouldPreserveZT0)
8173    Result =
8174        DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
8175                    {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8176
8177  if (RequiresLazySave) {
8178    // Conditionally restore the lazy save using a pseudo node.
8179    unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
8180    SDValue RegMask = DAG.getRegisterMask(
8181        TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
8182    SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8183        "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
8184    SDValue TPIDR2_EL0 = DAG.getNode(
8185        ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
8186        DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
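    // The RESTORE_ZA pseudo only calls __arm_tpidr2_restore when TPIDR2_EL0
    // reads back as zero, i.e. when a callee actually committed the lazy save.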
8187
8188    // Copy the address of the TPIDR2 block into X0 before 'calling' the
8189    // RESTORE_ZA pseudo.
8190    SDValue Glue;
8191    SDValue TPIDR2Block = DAG.getFrameIndex(
8192        FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8193    Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
8194    Result =
8195        DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
8196                    {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
8197                     RestoreRoutine, RegMask, Result.getValue(1)});
8198
8199    // Finally reset the TPIDR2_EL0 register to 0.
8200    Result = DAG.getNode(
8201        ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
8202        DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8203        DAG.getConstant(0, DL, MVT::i64));
8204  }
8205
8206  if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
8207    for (unsigned I = 0; I < InVals.size(); ++I) {
8208      // The smstart/smstop is chained as part of the call, but when the
8209      // resulting chain is discarded (which happens when the call is not part
8210      // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
8211      // smstart/smstop is chained to the result value. We can do that by doing
8212      // a vreg -> vreg copy.
8213      Register Reg = MF.getRegInfo().createVirtualRegister(
8214          getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8215      SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
8216      InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
8217                                     InVals[I].getValueType());
8218    }
8219  }
8220
8221  return Result;
8222}
8223
8224bool AArch64TargetLowering::CanLowerReturn(
8225    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
8226    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
8227  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8228  SmallVector<CCValAssign, 16> RVLocs;
8229  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
8230  return CCInfo.CheckReturn(Outs, RetCC);
8231}
8232
8233SDValue
8234AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
8235                                   bool isVarArg,
8236                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
8237                                   const SmallVectorImpl<SDValue> &OutVals,
8238                                   const SDLoc &DL, SelectionDAG &DAG) const {
8239  auto &MF = DAG.getMachineFunction();
8240  auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8241
8242  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8243  SmallVector<CCValAssign, 16> RVLocs;
8244  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
8245  CCInfo.AnalyzeReturn(Outs, RetCC);
8246
8247  // Copy the result values into the output registers.
8248  SDValue Glue;
8249  SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
8250  SmallSet<unsigned, 4> RegsUsed;
8251  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
8252       ++i, ++realRVLocIdx) {
8253    CCValAssign &VA = RVLocs[i];
8254    assert(VA.isRegLoc() && "Can only return in registers!");
8255    SDValue Arg = OutVals[realRVLocIdx];
8256
8257    switch (VA.getLocInfo()) {
8258    default:
8259      llvm_unreachable("Unknown loc info!");
8260    case CCValAssign::Full:
8261      if (Outs[i].ArgVT == MVT::i1) {
8262        // AAPCS requires i1 to be zero-extended to i8 by the producer of the
8263        // value. This is strictly redundant on Darwin (which uses "zeroext
8264        // i1"), but will be optimised out before ISel.
8265        Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8266        Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
8267      }
8268      break;
8269    case CCValAssign::BCvt:
8270      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
8271      break;
8272    case CCValAssign::AExt:
8273    case CCValAssign::ZExt:
8274      Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8275      break;
8276    case CCValAssign::AExtUpper:
8277      assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8278      Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8279      Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
8280                        DAG.getConstant(32, DL, VA.getLocVT()));
8281      break;
8282    }
8283
8284    if (RegsUsed.count(VA.getLocReg())) {
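      // As in LowerCall, two halves of an [N x i32] may have been assigned to
      // the same X register; merge them with an OR.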
8285      SDValue &Bits =
8286          llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
8287            return Elt.first == VA.getLocReg();
8288          })->second;
8289      Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
8290    } else {
8291      RetVals.emplace_back(VA.getLocReg(), Arg);
8292      RegsUsed.insert(VA.getLocReg());
8293    }
8294  }
8295
8296  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8297
8298  // Emit SMSTOP before returning from a locally streaming function
8299  SMEAttrs FuncAttrs(MF.getFunction());
8300  if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
8301    if (FuncAttrs.hasStreamingCompatibleInterface()) {
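      // A streaming-compatible body must only switch back if the function was
      // actually entered in non-streaming mode, so condition the SMSTOP on the
      // PSTATE.SM value saved in the prologue.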
8302      Register Reg = FuncInfo->getPStateSMReg();
8303      assert(Reg.isValid() && "PStateSM Register is invalid");
8304      SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
8305      Chain =
8306          changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8307                              /*Glue*/ SDValue(), PStateSM, /*Entry*/ false);
8308    } else
8309      Chain = changeStreamingMode(
8310          DAG, DL, /*Enable*/ false, Chain,
8311          /*Glue*/ SDValue(), DAG.getConstant(1, DL, MVT::i64), /*Entry*/ true);
8312    Glue = Chain.getValue(1);
8313  }
8314
8315  SmallVector<SDValue, 4> RetOps(1, Chain);
8316  for (auto &RetVal : RetVals) {
8317    Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
8318    Glue = Chain.getValue(1);
8319    RetOps.push_back(
8320        DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
8321  }
8322
8323  // Windows AArch64 ABIs require that for returning structs by value we copy
8324  // the sret argument into X0 for the return.
8325  // We saved the argument into a virtual register in the entry block,
8326  // so now we copy the value out and into X0.
8327  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
8328    SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
8329                                     getPointerTy(MF.getDataLayout()));
8330
8331    unsigned RetValReg = AArch64::X0;
8332    if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8333      RetValReg = AArch64::X8;
8334    Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
8335    Glue = Chain.getValue(1);
8336
8337    RetOps.push_back(
8338      DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
8339  }
8340
8341  const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
8342  if (I) {
8343    for (; *I; ++I) {
8344      if (AArch64::GPR64RegClass.contains(*I))
8345        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
8346      else if (AArch64::FPR64RegClass.contains(*I))
8347        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
8348      else
8349        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
8350    }
8351  }
8352
8353  RetOps[0] = Chain; // Update chain.
8354
8355  // Add the glue if we have it.
8356  if (Glue.getNode())
8357    RetOps.push_back(Glue);
8358
8359  if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8360    // ARM64EC entry thunks use a special return sequence: instead of a regular
8361    // "ret" instruction, they need to explicitly call the emulator.
8362    EVT PtrVT = getPointerTy(DAG.getDataLayout());
8363    SDValue Arm64ECRetDest =
8364        DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
8365    Arm64ECRetDest =
8366        getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
8367    Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
8368                                 MachinePointerInfo());
8369    RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
8370    RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
8371    return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
8372  }
8373
8374  return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
8375}
8376
8377//===----------------------------------------------------------------------===//
8378//  Other Lowering Code
8379//===----------------------------------------------------------------------===//
8380
8381SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
8382                                             SelectionDAG &DAG,
8383                                             unsigned Flag) const {
8384  return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
8385                                    N->getOffset(), Flag);
8386}
8387
8388SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
8389                                             SelectionDAG &DAG,
8390                                             unsigned Flag) const {
8391  return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
8392}
8393
8394SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
8395                                             SelectionDAG &DAG,
8396                                             unsigned Flag) const {
8397  return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
8398                                   N->getOffset(), Flag);
8399}
8400
8401SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
8402                                             SelectionDAG &DAG,
8403                                             unsigned Flag) const {
8404  return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
8405}
8406
8407SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
8408                                             SelectionDAG &DAG,
8409                                             unsigned Flag) const {
8410  return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
8411}
8412
8413// (loadGOT sym)
8414template <class NodeTy>
8415SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
8416                                      unsigned Flags) const {
8417  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
8418  SDLoc DL(N);
8419  EVT Ty = getPointerTy(DAG.getDataLayout());
8420  SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
8421  // FIXME: Once remat is capable of dealing with instructions with register
8422  // operands, expand this into two nodes instead of using a wrapper node.
8423  return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
8424}
8425
8426// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
8427template <class NodeTy>
8428SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
8429                                            unsigned Flags) const {
8430  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
8431  SDLoc DL(N);
8432  EVT Ty = getPointerTy(DAG.getDataLayout());
8433  const unsigned char MO_NC = AArch64II::MO_NC;
8434  return DAG.getNode(
8435      AArch64ISD::WrapperLarge, DL, Ty,
8436      getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
8437      getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
8438      getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
8439      getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
8440}
8441
8442// (addlow (adrp %hi(sym)) %lo(sym))
8443template <class NodeTy>
8444SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
8445                                       unsigned Flags) const {
8446  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
8447  SDLoc DL(N);
8448  EVT Ty = getPointerTy(DAG.getDataLayout());
8449  SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
8450  SDValue Lo = getTargetNode(N, Ty, DAG,
8451                             AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
8452  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
8453  return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
8454}
8455
8456// (adr sym)
8457template <class NodeTy>
8458SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
8459                                           unsigned Flags) const {
8460  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
8461  SDLoc DL(N);
8462  EVT Ty = getPointerTy(DAG.getDataLayout());
8463  SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
8464  return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
8465}
8466
8467SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
8468                                                  SelectionDAG &DAG) const {
8469  GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
8470  const GlobalValue *GV = GN->getGlobal();
8471  unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
8472
8473  if (OpFlags != AArch64II::MO_NO_FLAG)
8474    assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
8475           "unexpected offset in global node");
8476
8477  // This also catches the large code model case for Darwin, and tiny code
8478  // model with got relocations.
8479  if ((OpFlags & AArch64II::MO_GOT) != 0) {
8480    return getGOT(GN, DAG, OpFlags);
8481  }
8482
8483  SDValue Result;
8484  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
8485      !getTargetMachine().isPositionIndependent()) {
8486    Result = getAddrLarge(GN, DAG, OpFlags);
8487  } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
8488    Result = getAddrTiny(GN, DAG, OpFlags);
8489  } else {
8490    Result = getAddr(GN, DAG, OpFlags);
8491  }
8492  EVT PtrVT = getPointerTy(DAG.getDataLayout());
8493  SDLoc DL(GN);
8494  if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
8495    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
8496                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
8497  return Result;
8498}
8499
8500/// Convert a TLS address reference into the correct sequence of loads
8501/// and calls to compute the variable's address (for Darwin, currently) and
8502/// return an SDValue containing the final node.
8503
8504/// Darwin only has one TLS scheme which must be capable of dealing with the
8505/// fully general situation, in the worst case. This means:
8506///     + "extern __thread" declaration.
8507///     + Defined in a possibly unknown dynamic library.
8508///
8509/// The general system is that each __thread variable has a [3 x i64] descriptor
8510/// which contains information used by the runtime to calculate the address. The
8511/// only part of this the compiler needs to know about is the first xword, which
8512/// contains a function pointer that must be called with the address of the
8513/// entire descriptor in "x0".
8514///
8515/// Since this descriptor may be in a different unit, in general even the
8516/// descriptor must be accessed via an indirect load. The "ideal" code sequence
8517/// is:
8518///     adrp x0, _var@TLVPPAGE
8519///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
8520///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
8521///                                      ; the function pointer
8522///     blr x1                           ; Uses descriptor address in x0
8523///     ; Address of _var is now in x0.
8524///
8525/// If the address of _var's descriptor *is* known to the linker, then it can
8526/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
8527/// a slight efficiency gain.
8528SDValue
8529AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
8530                                                   SelectionDAG &DAG) const {
8531  assert(Subtarget->isTargetDarwin() &&
8532         "This function expects a Darwin target");
8533
8534  SDLoc DL(Op);
8535  MVT PtrVT = getPointerTy(DAG.getDataLayout());
8536  MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
8537  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
8538
8539  SDValue TLVPAddr =
8540      DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
8541  SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
8542
8543  // The first entry in the descriptor is a function pointer that we must call
8544  // to obtain the address of the variable.
8545  SDValue Chain = DAG.getEntryNode();
8546  SDValue FuncTLVGet = DAG.getLoad(
8547      PtrMemVT, DL, Chain, DescAddr,
8548      MachinePointerInfo::getGOT(DAG.getMachineFunction()),
8549      Align(PtrMemVT.getSizeInBits() / 8),
8550      MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
8551  Chain = FuncTLVGet.getValue(1);
8552
8553  // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
8554  FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
8555
8556  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8557  MFI.setAdjustsStack(true);
8558
8559  // TLS calls preserve all registers except those that absolutely must be
8560  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
8561  // silly).
8562  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8563  const uint32_t *Mask = TRI->getTLSCallPreservedMask();
8564  if (Subtarget->hasCustomCallingConv())
8565    TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
8566
8567  // Finally, we can make the call. This is just a degenerate version of a
8568  // normal AArch64 call node: x0 takes the address of the descriptor, and
8569  // returns the address of the variable in this thread.
8570  Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
8571  Chain =
8572      DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
8573                  Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
8574                  DAG.getRegisterMask(Mask), Chain.getValue(1));
8575  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
8576}
8577
8578/// Convert a thread-local variable reference into a sequence of instructions to
8579/// compute the variable's address for the local exec TLS model of ELF targets.
8580/// The sequence depends on the maximum TLS area size.
8581SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
8582                                                    SDValue ThreadBase,
8583                                                    const SDLoc &DL,
8584                                                    SelectionDAG &DAG) const {
8585  EVT PtrVT = getPointerTy(DAG.getDataLayout());
8586  SDValue TPOff, Addr;
8587
8588  switch (DAG.getTarget().Options.TLSSize) {
8589  default:
8590    llvm_unreachable("Unexpected TLS size");
8591
8592  case 12: {
8593    // mrs   x0, TPIDR_EL0
8594    // add   x0, x0, :tprel_lo12:a
8595    SDValue Var = DAG.getTargetGlobalAddress(
8596        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
8597    return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
8598                                      Var,
8599                                      DAG.getTargetConstant(0, DL, MVT::i32)),
8600                   0);
8601  }
8602
8603  case 24: {
8604    // mrs   x0, TPIDR_EL0
8605    // add   x0, x0, :tprel_hi12:a
8606    // add   x0, x0, :tprel_lo12_nc:a
8607    SDValue HiVar = DAG.getTargetGlobalAddress(
8608        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
8609    SDValue LoVar = DAG.getTargetGlobalAddress(
8610        GV, DL, PtrVT, 0,
8611        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
8612    Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
8613                                      HiVar,
8614                                      DAG.getTargetConstant(0, DL, MVT::i32)),
8615                   0);
8616    return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
8617                                      LoVar,
8618                                      DAG.getTargetConstant(0, DL, MVT::i32)),
8619                   0);
8620  }
8621
8622  case 32: {
8623    // mrs   x1, TPIDR_EL0
8624    // movz  x0, #:tprel_g1:a
8625    // movk  x0, #:tprel_g0_nc:a
8626    // add   x0, x1, x0
8627    SDValue HiVar = DAG.getTargetGlobalAddress(
8628        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
8629    SDValue LoVar = DAG.getTargetGlobalAddress(
8630        GV, DL, PtrVT, 0,
8631        AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
8632    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
8633                                       DAG.getTargetConstant(16, DL, MVT::i32)),
8634                    0);
8635    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
8636                                       DAG.getTargetConstant(0, DL, MVT::i32)),
8637                    0);
8638    return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
8639  }
8640
8641  case 48: {
8642    // mrs   x1, TPIDR_EL0
8643    // movz  x0, #:tprel_g2:a
8644    // movk  x0, #:tprel_g1_nc:a
8645    // movk  x0, #:tprel_g0_nc:a
8646    // add   x0, x1, x0
8647    SDValue HiVar = DAG.getTargetGlobalAddress(
8648        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
8649    SDValue MiVar = DAG.getTargetGlobalAddress(
8650        GV, DL, PtrVT, 0,
8651        AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
8652    SDValue LoVar = DAG.getTargetGlobalAddress(
8653        GV, DL, PtrVT, 0,
8654        AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
8655    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
8656                                       DAG.getTargetConstant(32, DL, MVT::i32)),
8657                    0);
8658    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
8659                                       DAG.getTargetConstant(16, DL, MVT::i32)),
8660                    0);
8661    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
8662                                       DAG.getTargetConstant(0, DL, MVT::i32)),
8663                    0);
8664    return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
8665  }
8666  }
8667}
8668
8669/// When accessing thread-local variables under either the general-dynamic or
8670/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
8671/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
8672/// is a function pointer to carry out the resolution.
8673///
8674/// The sequence is:
8675///    adrp  x0, :tlsdesc:var
8676///    ldr   x1, [x0, #:tlsdesc_lo12:var]
8677///    add   x0, x0, #:tlsdesc_lo12:var
8678///    .tlsdesccall var
8679///    blr   x1
8680///    (TPIDR_EL0 offset now in x0)
8681///
8682///  The above sequence must be produced unscheduled so that the linker can
8683///  optimize/relax it.
8684///  Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent it
8685///  and is expanded very late in the compilation flow, to ensure the sequence
8686///  is emitted exactly as shown above.
8687SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
8688                                                      const SDLoc &DL,
8689                                                      SelectionDAG &DAG) const {
8690  EVT PtrVT = getPointerTy(DAG.getDataLayout());
8691
8692  SDValue Chain = DAG.getEntryNode();
8693  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8694
8695  Chain =
8696      DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
8697  SDValue Glue = Chain.getValue(1);
8698
8699  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
8700}
8701
8702SDValue
8703AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
8704                                                SelectionDAG &DAG) const {
8705  assert(Subtarget->isTargetELF() && "This function expects an ELF target");
8706
8707  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
8708
8709  TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
8710
8711  if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
8712    if (Model == TLSModel::LocalDynamic)
8713      Model = TLSModel::GeneralDynamic;
8714  }
8715
8716  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
8717      Model != TLSModel::LocalExec)
8718    report_fatal_error("ELF TLS only supported in small memory model or "
8719                       "in local exec TLS model");
8720  // Different choices can be made for the maximum size of the TLS area for a
8721  // module. For the small address model, the default TLS size is 16MiB and the
8722  // maximum TLS size is 4GiB.
8723  // FIXME: add tiny and large code model support for TLS access models other
8724  // than local exec. We currently generate the same code as small for tiny,
8725  // which may be larger than needed.
8726
8727  SDValue TPOff;
8728  EVT PtrVT = getPointerTy(DAG.getDataLayout());
8729  SDLoc DL(Op);
8730  const GlobalValue *GV = GA->getGlobal();
8731
8732  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
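  // Read the thread pointer (TPIDR_EL0); every ELF TLS model below computes
  // the variable's address as an offset from it.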
8733
8734  if (Model == TLSModel::LocalExec) {
8735    return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
8736  } else if (Model == TLSModel::InitialExec) {
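    // Initial-exec: the offset from the thread pointer is loaded from the GOT,
    // where the dynamic linker fills it in at load time.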
8737    TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
8738    TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
8739  } else if (Model == TLSModel::LocalDynamic) {
8740    // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
8741    // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
8742    // the beginning of the module's TLS region, followed by a DTPREL offset
8743    // calculation.
8744
8745    // These accesses will need deduplicating if there's more than one.
8746    AArch64FunctionInfo *MFI =
8747        DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
8748    MFI->incNumLocalDynamicTLSAccesses();
8749
8750    // The call needs a relocation too for linker relaxation. It doesn't make
8751    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
8752    // the address.
8753    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
8754                                                  AArch64II::MO_TLS);
8755
8756    // Now we can calculate the offset from TPIDR_EL0 to this module's
8757    // thread-local area.
8758    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
8759
8760    // Now use :dtprel_whatever: operations to calculate this variable's offset
8761    // in its thread-storage area.
8762    SDValue HiVar = DAG.getTargetGlobalAddress(
8763        GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
8764    SDValue LoVar = DAG.getTargetGlobalAddress(
8765        GV, DL, MVT::i64, 0,
8766        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
8767
8768    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
8769                                       DAG.getTargetConstant(0, DL, MVT::i32)),
8770                    0);
8771    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
8772                                       DAG.getTargetConstant(0, DL, MVT::i32)),
8773                    0);
8774  } else if (Model == TLSModel::GeneralDynamic) {
8775    // The call needs a relocation too for linker relaxation. It doesn't make
8776    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
8777    // the address.
8778    SDValue SymAddr =
8779        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
8780
8781    // Finally we can make a call to calculate the offset from tpidr_el0.
8782    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
8783  } else
8784    llvm_unreachable("Unsupported ELF TLS access model");
8785
8786  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
8787}
8788
8789SDValue
8790AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
8791                                                    SelectionDAG &DAG) const {
8792  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
8793
8794  SDValue Chain = DAG.getEntryNode();
8795  EVT PtrVT = getPointerTy(DAG.getDataLayout());
8796  SDLoc DL(Op);
8797
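  // On Windows, the platform register x18 points at the Thread Environment
  // Block (TEB).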
8798  SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
8799
8800  // Load the ThreadLocalStoragePointer from the TEB
8801  // A pointer to the TLS array is located at offset 0x58 from the TEB.
8802  SDValue TLSArray =
8803      DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
8804  TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
8805  Chain = TLSArray.getValue(1);
8806
8807  // Load the TLS index from the C runtime.
8808  // This does the same as getAddr(), but without having a GlobalAddressSDNode.
8809  // This also does the same as LOADgot, but using a generic i32 load,
8810  // while LOADgot only loads i64.
8811  SDValue TLSIndexHi =
8812      DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
8813  SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
8814      "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
8815  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
8816  SDValue TLSIndex =
8817      DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
8818  TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
8819  Chain = TLSIndex.getValue(1);
8820
8821  // The pointer to the thread's TLS data area lives in the TLS array at the
8822  // index just loaded, i.e. at a byte offset of TLSIndex * 8.
8823  TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
8824  SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
8825                             DAG.getConstant(3, DL, PtrVT));
8826  SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
8827                            DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
8828                            MachinePointerInfo());
8829  Chain = TLS.getValue(1);
8830
8831  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
8832  const GlobalValue *GV = GA->getGlobal();
8833  SDValue TGAHi = DAG.getTargetGlobalAddress(
8834      GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
8835  SDValue TGALo = DAG.getTargetGlobalAddress(
8836      GV, DL, PtrVT, 0,
8837      AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
8838
8839  // Add the offset from the start of the .tls section (section base).
8840  SDValue Addr =
8841      SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
8842                                 DAG.getTargetConstant(0, DL, MVT::i32)),
8843              0);
8844  Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
8845  return Addr;
8846}
8847
8848SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
8849                                                     SelectionDAG &DAG) const {
8850  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
8851  if (DAG.getTarget().useEmulatedTLS())
8852    return LowerToTLSEmulatedModel(GA, DAG);
8853
8854  if (Subtarget->isTargetDarwin())
8855    return LowerDarwinGlobalTLSAddress(Op, DAG);
8856  if (Subtarget->isTargetELF())
8857    return LowerELFGlobalTLSAddress(Op, DAG);
8858  if (Subtarget->isTargetWindows())
8859    return LowerWindowsGlobalTLSAddress(Op, DAG);
8860
8861  llvm_unreachable("Unexpected platform trying to use TLS");
8862}
8863
8864// Looks through \p Val to determine the bit that can be used to
8865// check the sign of the value. It returns the unextended value and
8866// the sign bit position.
8867std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
8868  if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
8869    return {Val.getOperand(0),
8870            cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
8871                1};
8872
8873  if (Val.getOpcode() == ISD::SIGN_EXTEND)
8874    return {Val.getOperand(0),
8875            Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
8876
8877  return {Val, Val.getValueSizeInBits() - 1};
8878}
8879
8880SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
8881  SDValue Chain = Op.getOperand(0);
8882  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
8883  SDValue LHS = Op.getOperand(2);
8884  SDValue RHS = Op.getOperand(3);
8885  SDValue Dest = Op.getOperand(4);
8886  SDLoc dl(Op);
8887
8888  MachineFunction &MF = DAG.getMachineFunction();
8889  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
8890  // will not be produced, as they are conditional branch instructions that do
8891  // not set flags.
8892  bool ProduceNonFlagSettingCondBr =
8893      !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
8894
8895  // Handle f128 first, since lowering it will result in comparing the return
8896  // value of a libcall against zero, which is just what the rest of LowerBR_CC
8897  // is expecting to deal with.
8898  if (LHS.getValueType() == MVT::f128) {
8899    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
8900
8901    // If softenSetCCOperands returned a scalar, we need to compare the result
8902    // against zero to select between true and false values.
8903    if (!RHS.getNode()) {
8904      RHS = DAG.getConstant(0, dl, LHS.getValueType());
8905      CC = ISD::SETNE;
8906    }
8907  }
8908
8909  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
8910  // instruction.
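  // For instance, a branch on the overflow bit of an i32 uadd.with.overflow
  // would typically end up as "adds w8, w0, w1; b.hs dest" (a rough sketch;
  // the actual condition code comes from getAArch64XALUOOp below).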
8911  if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
8912      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
8913    // Only lower legal XALUO ops.
8914    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
8915      return SDValue();
8916
8917    // The actual operation with overflow check.
8918    AArch64CC::CondCode OFCC;
8919    SDValue Value, Overflow;
8920    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
8921
8922    if (CC == ISD::SETNE)
8923      OFCC = getInvertedCondCode(OFCC);
8924    SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
8925
8926    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
8927                       Overflow);
8928  }
8929
8930  if (LHS.getValueType().isInteger()) {
8931    assert((LHS.getValueType() == RHS.getValueType()) &&
8932           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
8933
8934    // If the RHS of the comparison is zero, we can potentially fold this
8935    // to a specialized branch.
8936    const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
8937    if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
8938      if (CC == ISD::SETEQ) {
8939        // See if we can use a TBZ to fold in an AND as well.
8940        // TBZ has a smaller branch displacement than CBZ.  If the offset is
8941        // out of bounds, a late MI-layer pass rewrites branches.
8942        // 403.gcc is an example that hits this case.
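        // For example, (brcond (seteq (and x, 0x8), 0), dest) becomes
        // "tbz x, #3, dest" (illustrative; the bit index is Log2 of the mask).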
8943        if (LHS.getOpcode() == ISD::AND &&
8944            isa<ConstantSDNode>(LHS.getOperand(1)) &&
8945            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
8946          SDValue Test = LHS.getOperand(0);
8947          uint64_t Mask = LHS.getConstantOperandVal(1);
8948          return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
8949                             DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
8950                             Dest);
8951        }
8952
8953        return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
8954      } else if (CC == ISD::SETNE) {
8955        // See if we can use a TBZ to fold in an AND as well.
8956        // TBZ has a smaller branch displacement than CBZ.  If the offset is
8957        // out of bounds, a late MI-layer pass rewrites branches.
8958        // 403.gcc is an example that hits this case.
8959        if (LHS.getOpcode() == ISD::AND &&
8960            isa<ConstantSDNode>(LHS.getOperand(1)) &&
8961            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
8962          SDValue Test = LHS.getOperand(0);
8963          uint64_t Mask = LHS.getConstantOperandVal(1);
8964          return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
8965                             DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
8966                             Dest);
8967        }
8968
8969        return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
8970      } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
8971        // Don't combine AND since emitComparison converts the AND to an ANDS
8972        // (a.k.a. TST) and the test in the test bit and branch instruction
8973        // becomes redundant.  This would also increase register pressure.
8974        uint64_t SignBitPos;
8975        std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
8976        return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
8977                           DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
8978      }
8979    }
8980    if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
8981        LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
8982      // Don't combine AND since emitComparison converts the AND to an ANDS
8983      // (a.k.a. TST) and the test in the test bit and branch instruction
8984      // becomes redundant.  This would also increase register pressure.
8985      uint64_t SignBitPos;
8986      std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
8987      return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
8988                         DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
8989    }
8990
8991    SDValue CCVal;
8992    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
8993    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
8994                       Cmp);
8995  }
8996
8997  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
8998         LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
8999
9000  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9001  // clean.  Some of them require two branches to implement.
9002  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9003  AArch64CC::CondCode CC1, CC2;
9004  changeFPCCToAArch64CC(CC, CC1, CC2);
9005  SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9006  SDValue BR1 =
9007      DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
9008  if (CC2 != AArch64CC::AL) {
9009    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9010    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
9011                       Cmp);
9012  }
9013
9014  return BR1;
9015}
9016
9017SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
9018                                              SelectionDAG &DAG) const {
9019  if (!Subtarget->hasNEON())
9020    return SDValue();
9021
9022  EVT VT = Op.getValueType();
9023  EVT IntVT = VT.changeTypeToInteger();
9024  SDLoc DL(Op);
9025
9026  SDValue In1 = Op.getOperand(0);
9027  SDValue In2 = Op.getOperand(1);
9028  EVT SrcVT = In2.getValueType();
9029
9030  if (!SrcVT.bitsEq(VT))
9031    In2 = DAG.getFPExtendOrRound(In2, DL, VT);
9032
9033  if (VT.isScalableVector())
9034    IntVT =
9035        getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
9036
9037  if (VT.isFixedLengthVector() &&
9038      useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
9039    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
9040
9041    In1 = convertToScalableVector(DAG, ContainerVT, In1);
9042    In2 = convertToScalableVector(DAG, ContainerVT, In2);
9043
9044    SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
9045    return convertFromScalableVector(DAG, VT, Res);
9046  }
9047
9048  auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
9049    if (VT.isScalableVector())
9050      return getSVESafeBitCast(VT, Op, DAG);
9051
9052    return DAG.getBitcast(VT, Op);
9053  };
9054
9055  SDValue VecVal1, VecVal2;
9056  EVT VecVT;
9057  auto SetVecVal = [&](int Idx = -1) {
9058    if (!VT.isVector()) {
9059      VecVal1 =
9060          DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
9061      VecVal2 =
9062          DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
9063    } else {
9064      VecVal1 = BitCast(VecVT, In1, DAG);
9065      VecVal2 = BitCast(VecVT, In2, DAG);
9066    }
9067  };
9068  if (VT.isVector()) {
9069    VecVT = IntVT;
9070    SetVecVal();
9071  } else if (VT == MVT::f64) {
9072    VecVT = MVT::v2i64;
9073    SetVecVal(AArch64::dsub);
9074  } else if (VT == MVT::f32) {
9075    VecVT = MVT::v4i32;
9076    SetVecVal(AArch64::ssub);
9077  } else if (VT == MVT::f16) {
9078    VecVT = MVT::v8i16;
9079    SetVecVal(AArch64::hsub);
9080  } else {
9081    llvm_unreachable("Invalid type for copysign!");
9082  }
9083
9084  unsigned BitWidth = In1.getScalarValueSizeInBits();
9085  SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
9086
9087  // We want to materialize a mask with every bit but the high bit set, but the
9088  // AdvSIMD immediate moves cannot materialize that in a single instruction for
9089  // 64-bit elements. Instead, materialize all bits set and then negate that.
9090  if (VT == MVT::f64 || VT == MVT::v2f64) {
9091    SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
9092    SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
9093    SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
9094    SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
9095  }
9096
9097  SDValue BSP =
9098      DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
9099  if (VT == MVT::f16)
9100    return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
9101  if (VT == MVT::f32)
9102    return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
9103  if (VT == MVT::f64)
9104    return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
9105
9106  return BitCast(VT, BSP, DAG);
9107}
9108
9109SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
9110                                                 SelectionDAG &DAG) const {
9111  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
9112          Attribute::NoImplicitFloat))
9113    return SDValue();
9114
9115  if (!Subtarget->hasNEON())
9116    return SDValue();
9117
9118  bool IsParity = Op.getOpcode() == ISD::PARITY;
9119  SDValue Val = Op.getOperand(0);
9120  SDLoc DL(Op);
9121  EVT VT = Op.getValueType();
9122
  // For i32, the generic parity expansion using EORs is more efficient than
  // going through the floating-point unit.
9125  if (VT == MVT::i32 && IsParity)
9126    return SDValue();
9127
9128  // If there is no CNT instruction available, GPR popcount can
9129  // be more efficiently lowered to the following sequence that uses
9130  // AdvSIMD registers/instructions as long as the copies to/from
9131  // the AdvSIMD registers are cheap.
9132  //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
9133  //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
9134  //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
9135  //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
9136  if (VT == MVT::i32 || VT == MVT::i64) {
9137    if (VT == MVT::i32)
9138      Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
9139    Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
9140
9141    SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
9142    SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9143    UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9144                         DAG.getConstant(0, DL, MVT::i64));
9145
9146    if (IsParity)
9147      UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9148                           DAG.getConstant(1, DL, MVT::i32));
9149
9150    if (VT == MVT::i64)
9151      UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
9152    return UaddLV;
9153  } else if (VT == MVT::i128) {
9154    Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
9155
9156    SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
9157    SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9158    UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9159                         DAG.getConstant(0, DL, MVT::i64));
9160
9161    if (IsParity)
9162      UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9163                           DAG.getConstant(1, DL, MVT::i32));
9164
9165    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
9166  }
9167
9168  assert(!IsParity && "ISD::PARITY of vector types not supported");
9169
9170  if (VT.isScalableVector() ||
9171      useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
9172    return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
9173
9174  assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
9175          VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
9176         "Unexpected type for custom ctpop lowering");
9177
9178  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
9179  Val = DAG.getBitcast(VT8Bit, Val);
9180  Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
9181
9182  // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
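  // E.g. for v4i32: ctpop v16i8 -> uaddlp to v8i16 -> uaddlp to v4i32 (each
  // pairwise add halves the element count and doubles the element size).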
9183  unsigned EltSize = 8;
9184  unsigned NumElts = VT.is64BitVector() ? 8 : 16;
9185  while (EltSize != VT.getScalarSizeInBits()) {
9186    EltSize *= 2;
9187    NumElts /= 2;
9188    MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
9189    Val = DAG.getNode(
9190        ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
9191        DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
9192  }
9193
9194  return Val;
9195}
9196
9197SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
9198  EVT VT = Op.getValueType();
9199  assert(VT.isScalableVector() ||
9200         useSVEForFixedLengthVectorVT(
9201             VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
9202
9203  SDLoc DL(Op);
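  // cttz(x) == ctlz(bitreverse(x)): reversing the bits turns trailing zeros
  // into leading zeros, so the count is unchanged.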
9204  SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
9205  return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
9206}
9207
9208SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
9209                                           SelectionDAG &DAG) const {
9210
9211  EVT VT = Op.getValueType();
9212  SDLoc DL(Op);
9213  unsigned Opcode = Op.getOpcode();
9214  ISD::CondCode CC;
9215  switch (Opcode) {
9216  default:
9217    llvm_unreachable("Wrong instruction");
9218  case ISD::SMAX:
9219    CC = ISD::SETGT;
9220    break;
9221  case ISD::SMIN:
9222    CC = ISD::SETLT;
9223    break;
9224  case ISD::UMAX:
9225    CC = ISD::SETUGT;
9226    break;
9227  case ISD::UMIN:
9228    CC = ISD::SETULT;
9229    break;
9230  }
9231
9232  if (VT.isScalableVector() ||
9233      useSVEForFixedLengthVectorVT(
9234          VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
9235    switch (Opcode) {
9236    default:
9237      llvm_unreachable("Wrong instruction");
9238    case ISD::SMAX:
9239      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
9240    case ISD::SMIN:
9241      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
9242    case ISD::UMAX:
9243      return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
9244    case ISD::UMIN:
9245      return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
9246    }
9247  }
9248
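  // Generic fallback, e.g. smax(a, b) -> select (setgt a, b), a, b.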
9249  SDValue Op0 = Op.getOperand(0);
9250  SDValue Op1 = Op.getOperand(1);
9251  SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
9252  return DAG.getSelect(DL, VT, Cond, Op0, Op1);
9253}
9254
9255SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
9256                                               SelectionDAG &DAG) const {
9257  EVT VT = Op.getValueType();
9258
9259  if (VT.isScalableVector() ||
9260      useSVEForFixedLengthVectorVT(
9261          VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
9262    return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
9263
9264  SDLoc DL(Op);
9265  SDValue REVB;
9266  MVT VST;
9267
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("Invalid type for bitreverse!");
  case MVT::v2i32:
    VST = MVT::v8i8;
    REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
    break;
  case MVT::v4i32:
    VST = MVT::v16i8;
    REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
    break;
  case MVT::v1i64:
    VST = MVT::v8i8;
    REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
    break;
  case MVT::v2i64:
    VST = MVT::v16i8;
    REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
    break;
  }
9300
9301  return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
9302                     DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
9303}
9304
// Check whether N forms a chain of comparisons: an OR-tree whose leaves are
// XOR nodes. Collects the XOR operand pairs into WorkList.
9306static bool
9307isOrXorChain(SDValue N, unsigned &Num,
9308             SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
9309  if (Num == MaxXors)
9310    return false;
9311
9312  // Skip the one-use zext
9313  if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
9314    N = N->getOperand(0);
9315
9316  // The leaf node must be XOR
9317  if (N->getOpcode() == ISD::XOR) {
9318    WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
9319    Num++;
9320    return true;
9321  }
9322
9323  // All the non-leaf nodes must be OR.
9324  if (N->getOpcode() != ISD::OR || !N->hasOneUse())
9325    return false;
9326
9327  if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
9328      isOrXorChain(N->getOperand(1), Num, WorkList))
9329    return true;
9330  return false;
9331}
9332
// Transform chains of ORs and XORs, as are usually produced by memcmp/bcmp
// expansion.
9334static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
9335  SDValue LHS = N->getOperand(0);
9336  SDValue RHS = N->getOperand(1);
9337  SDLoc DL(N);
9338  EVT VT = N->getValueType(0);
9339  SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
9340
9341  // Only handle integer compares.
9342  if (N->getOpcode() != ISD::SETCC)
9343    return SDValue();
9344
9345  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
9346  // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
9347  // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
9348  unsigned NumXors = 0;
9349  if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
9350      LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
9351      isOrXorChain(LHS, NumXors, WorkList)) {
9352    SDValue XOR0, XOR1;
9353    std::tie(XOR0, XOR1) = WorkList[0];
9354    unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
9355    SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
9356    for (unsigned I = 1; I < WorkList.size(); I++) {
9357      std::tie(XOR0, XOR1) = WorkList[I];
9358      SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
9359      Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
9360    }
9361
    // Return the combined comparison chain.
9363    return Cmp;
9364  }
9365
9366  return SDValue();
9367}
9368
9369SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
9370
9371  if (Op.getValueType().isVector())
9372    return LowerVSETCC(Op, DAG);
9373
9374  bool IsStrict = Op->isStrictFPOpcode();
9375  bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
9376  unsigned OpNo = IsStrict ? 1 : 0;
9377  SDValue Chain;
9378  if (IsStrict)
9379    Chain = Op.getOperand(0);
9380  SDValue LHS = Op.getOperand(OpNo + 0);
9381  SDValue RHS = Op.getOperand(OpNo + 1);
9382  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
9383  SDLoc dl(Op);
9384
9385  // We chose ZeroOrOneBooleanContents, so use zero and one.
9386  EVT VT = Op.getValueType();
9387  SDValue TVal = DAG.getConstant(1, dl, VT);
9388  SDValue FVal = DAG.getConstant(0, dl, VT);
9389
9390  // Handle f128 first, since one possible outcome is a normal integer
9391  // comparison which gets picked up by the next if statement.
9392  if (LHS.getValueType() == MVT::f128) {
9393    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
9394                        IsSignaling);
9395
9396    // If softenSetCCOperands returned a scalar, use it.
9397    if (!RHS.getNode()) {
9398      assert(LHS.getValueType() == Op.getValueType() &&
9399             "Unexpected setcc expansion!");
9400      return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
9401    }
9402  }
9403
9404  if (LHS.getValueType().isInteger()) {
9405    SDValue CCVal;
9406    SDValue Cmp = getAArch64Cmp(
9407        LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
9408
9409    // Note that we inverted the condition above, so we reverse the order of
9410    // the true and false operands here.  This will allow the setcc to be
9411    // matched to a single CSINC instruction.
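    // For example, an i32 (seteq x, y) typically ends up as "cmp x, y;
    // cset w0, eq", where cset is an alias of csinc with wzr operands
    // (illustrative).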
9412    SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
9413    return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
9414  }
9415
9416  // Now we know we're dealing with FP values.
9417  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
9418         LHS.getValueType() == MVT::f64);
9419
  // We need to perform an FCMP + CSEL sequence.  Go ahead and do the
  // comparison.
9422  SDValue Cmp;
9423  if (IsStrict)
9424    Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
9425  else
9426    Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9427
9428  AArch64CC::CondCode CC1, CC2;
9429  changeFPCCToAArch64CC(CC, CC1, CC2);
9430  SDValue Res;
9431  if (CC2 == AArch64CC::AL) {
9432    changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
9433                          CC2);
9434    SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9435
9436    // Note that we inverted the condition above, so we reverse the order of
9437    // the true and false operands here.  This will allow the setcc to be
9438    // matched to a single CSINC instruction.
9439    Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
9440  } else {
    // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
    // totally clean.  Some of them require two CSELs to implement.  In this
    // case, we emit the first CSEL and then emit a second using the output of
    // the first as the RHS.  We're effectively OR'ing the two CC's together.
9445
9446    // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
9447    SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9448    SDValue CS1 =
9449        DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
9450
9451    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9452    Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
9453  }
9454  return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
9455}
9456
9457SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
9458                                               SelectionDAG &DAG) const {
9459
9460  SDValue LHS = Op.getOperand(0);
9461  SDValue RHS = Op.getOperand(1);
9462  EVT VT = LHS.getValueType();
9463  if (VT != MVT::i32 && VT != MVT::i64)
9464    return SDValue();
9465
9466  SDLoc DL(Op);
9467  SDValue Carry = Op.getOperand(2);
9468  // SBCS uses a carry not a borrow so the carry flag should be inverted first.
9469  SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
9470  SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
9471                            LHS, RHS, InvCarry);
9472
9473  EVT OpVT = Op.getValueType();
9474  SDValue TVal = DAG.getConstant(1, DL, OpVT);
9475  SDValue FVal = DAG.getConstant(0, DL, OpVT);
9476
9477  ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
9478  ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
9479  SDValue CCVal =
9480      DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
9481  // Inputs are swapped because the condition is inverted. This will allow
9482  // matching with a single CSINC instruction.
9483  return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
9484                     Cmp.getValue(1));
9485}
9486
9487SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
9488                                              SDValue RHS, SDValue TVal,
9489                                              SDValue FVal, const SDLoc &dl,
9490                                              SelectionDAG &DAG) const {
9491  // Handle f128 first, because it will result in a comparison of some RTLIB
9492  // call result against zero.
9493  if (LHS.getValueType() == MVT::f128) {
9494    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9495
9496    // If softenSetCCOperands returned a scalar, we need to compare the result
9497    // against zero to select between true and false values.
9498    if (!RHS.getNode()) {
9499      RHS = DAG.getConstant(0, dl, LHS.getValueType());
9500      CC = ISD::SETNE;
9501    }
9502  }
9503
9504  // Also handle f16, for which we need to do a f32 comparison.
9505  if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
9506    LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
9507    RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
9508  }
9509
9510  // Next, handle integers.
9511  if (LHS.getValueType().isInteger()) {
9512    assert((LHS.getValueType() == RHS.getValueType()) &&
9513           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9514
9515    ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
9516    ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
9517    ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
    // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
    // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
    // supported types.
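    // (For i32, "asr lhs, #31" yields 0 for non-negative and -1 for negative
    // values; OR-ing in 1 then produces 1 or -1 respectively.)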
9521    if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
9522        CTVal->isOne() && CFVal->isAllOnes() &&
9523        LHS.getValueType() == TVal.getValueType()) {
9524      EVT VT = LHS.getValueType();
9525      SDValue Shift =
9526          DAG.getNode(ISD::SRA, dl, VT, LHS,
9527                      DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
9528      return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
9529    }
9530
9531    // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
9532    // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
9533    // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
    // Both require fewer instructions than a compare and conditional select.
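    // (E.g. for setgt: the arithmetic shift produces an all-ones mask exactly
    // when lhs is negative; BIC then clears negative values to zero while
    // leaving non-negative values unchanged, i.e. smax(lhs, 0).)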
9535    if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
9536        RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
9537        LHS.getValueType() == RHS.getValueType()) {
9538      EVT VT = LHS.getValueType();
9539      SDValue Shift =
9540          DAG.getNode(ISD::SRA, dl, VT, LHS,
9541                      DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
9542
9543      if (CC == ISD::SETGT)
9544        Shift = DAG.getNOT(dl, Shift, VT);
9545
9546      return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
9547    }
9548
9549    unsigned Opcode = AArch64ISD::CSEL;
9550
    // If both the TVal and the FVal are constants, see if we can swap them in
    // order to form a CSINV or CSINC out of them.
9553    if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
9554      std::swap(TVal, FVal);
9555      std::swap(CTVal, CFVal);
9556      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9557    } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
9558      std::swap(TVal, FVal);
9559      std::swap(CTVal, CFVal);
9560      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9561    } else if (TVal.getOpcode() == ISD::XOR) {
9562      // If TVal is a NOT we want to swap TVal and FVal so that we can match
9563      // with a CSINV rather than a CSEL.
9564      if (isAllOnesConstant(TVal.getOperand(1))) {
9565        std::swap(TVal, FVal);
9566        std::swap(CTVal, CFVal);
9567        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9568      }
9569    } else if (TVal.getOpcode() == ISD::SUB) {
9570      // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
9571      // that we can match with a CSNEG rather than a CSEL.
9572      if (isNullConstant(TVal.getOperand(0))) {
9573        std::swap(TVal, FVal);
9574        std::swap(CTVal, CFVal);
9575        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9576      }
9577    } else if (CTVal && CFVal) {
9578      const int64_t TrueVal = CTVal->getSExtValue();
9579      const int64_t FalseVal = CFVal->getSExtValue();
9580      bool Swap = false;
9581
9582      // If both TVal and FVal are constants, see if FVal is the
9583      // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
9584      // instead of a CSEL in that case.
9585      if (TrueVal == ~FalseVal) {
9586        Opcode = AArch64ISD::CSINV;
9587      } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
9588                 TrueVal == -FalseVal) {
9589        Opcode = AArch64ISD::CSNEG;
9590      } else if (TVal.getValueType() == MVT::i32) {
9591        // If our operands are only 32-bit wide, make sure we use 32-bit
9592        // arithmetic for the check whether we can use CSINC. This ensures that
9593        // the addition in the check will wrap around properly in case there is
9594        // an overflow (which would not be the case if we do the check with
9595        // 64-bit arithmetic).
9596        const uint32_t TrueVal32 = CTVal->getZExtValue();
9597        const uint32_t FalseVal32 = CFVal->getZExtValue();
9598
9599        if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
9600          Opcode = AArch64ISD::CSINC;
9601
9602          if (TrueVal32 > FalseVal32) {
9603            Swap = true;
9604          }
9605        }
9606      } else {
9607        // 64-bit check whether we can use CSINC.
9608        const uint64_t TrueVal64 = TrueVal;
9609        const uint64_t FalseVal64 = FalseVal;
9610
9611        if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
9612          Opcode = AArch64ISD::CSINC;
9613
9614          if (TrueVal > FalseVal) {
9615            Swap = true;
9616          }
9617        }
9618      }
9619
9620      // Swap TVal and FVal if necessary.
9621      if (Swap) {
9622        std::swap(TVal, FVal);
9623        std::swap(CTVal, CFVal);
9624        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9625      }
9626
9627      if (Opcode != AArch64ISD::CSEL) {
9628        // Drop FVal since we can get its value by simply inverting/negating
9629        // TVal.
9630        FVal = TVal;
9631      }
9632    }
9633
9634    // Avoid materializing a constant when possible by reusing a known value in
9635    // a register.  However, don't perform this optimization if the known value
9636    // is one, zero or negative one in the case of a CSEL.  We can always
9637    // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
9638    // FVal, respectively.
9639    ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
9640    if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
9641        !RHSVal->isZero() && !RHSVal->isAllOnes()) {
9642      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
9643      // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
9644      // "a != C ? x : a" to avoid materializing C.
9645      if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
9646        TVal = LHS;
9647      else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
9648        FVal = LHS;
9649    } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
      assert(CTVal && CFVal && "Expected constant operands for CSNEG.");
9651      // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
9652      // avoid materializing C.
9653      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
9654      if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
9655        Opcode = AArch64ISD::CSINV;
9656        TVal = LHS;
9657        FVal = DAG.getConstant(0, dl, FVal.getValueType());
9658      }
9659    }
9660
9661    SDValue CCVal;
9662    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
9663    EVT VT = TVal.getValueType();
9664    return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
9665  }
9666
9667  // Now we know we're dealing with FP values.
9668  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
9669         LHS.getValueType() == MVT::f64);
9670  assert(LHS.getValueType() == RHS.getValueType());
9671  EVT VT = TVal.getValueType();
9672  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9673
9674  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9675  // clean.  Some of them require two CSELs to implement.
9676  AArch64CC::CondCode CC1, CC2;
9677  changeFPCCToAArch64CC(CC, CC1, CC2);
9678
9679  if (DAG.getTarget().Options.UnsafeFPMath) {
9680    // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
9681    // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
9682    ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
9683    if (RHSVal && RHSVal->isZero()) {
9684      ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
9685      ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
9686
9687      if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
9688          CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
9689        TVal = LHS;
9690      else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
9691               CFVal && CFVal->isZero() &&
9692               FVal.getValueType() == LHS.getValueType())
9693        FVal = LHS;
9694    }
9695  }
9696
9697  // Emit first, and possibly only, CSEL.
9698  SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9699  SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
9700
9701  // If we need a second CSEL, emit it, using the output of the first as the
9702  // RHS.  We're effectively OR'ing the two CC's together.
9703  if (CC2 != AArch64CC::AL) {
9704    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9705    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
9706  }
9707
9708  // Otherwise, return the output of the first CSEL.
9709  return CS1;
9710}
9711
9712SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
9713                                                  SelectionDAG &DAG) const {
9714  EVT Ty = Op.getValueType();
9715  auto Idx = Op.getConstantOperandAPInt(2);
9716  int64_t IdxVal = Idx.getSExtValue();
9717  assert(Ty.isScalableVector() &&
9718         "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
9719
9720  // We can use the splice instruction for certain index values where we are
9721  // able to efficiently generate the correct predicate. The index will be
9722  // inverted and used directly as the input to the ptrue instruction, i.e.
9723  // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
9724  // splice predicate. However, we can only do this if we can guarantee that
9725  // there are enough elements in the vector, hence we check the index <= min
9726  // number of elements.
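  // For example, IdxVal == -2 yields a "ptrue ..., vl2" predicate which, once
  // reversed, activates only the last two lanes; SPLICE then takes those two
  // elements from the first operand followed by elements of the second
  // (a sketch of the path below).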
9727  std::optional<unsigned> PredPattern;
9728  if (Ty.isScalableVector() && IdxVal < 0 &&
9729      (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
9730          std::nullopt) {
9731    SDLoc DL(Op);
9732
9733    // Create a predicate where all but the last -IdxVal elements are false.
9734    EVT PredVT = Ty.changeVectorElementType(MVT::i1);
9735    SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
9736    Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
9737
9738    // Now splice the two inputs together using the predicate.
9739    return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
9740                       Op.getOperand(1));
9741  }
9742
9743  // This will select to an EXT instruction, which has a maximum immediate
9744  // value of 255, hence 2048-bits is the maximum value we can lower.
9745  if (IdxVal >= 0 &&
9746      IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits()))
9747    return Op;
9748
9749  return SDValue();
9750}
9751
9752SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
9753                                              SelectionDAG &DAG) const {
9754  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
9755  SDValue LHS = Op.getOperand(0);
9756  SDValue RHS = Op.getOperand(1);
9757  SDValue TVal = Op.getOperand(2);
9758  SDValue FVal = Op.getOperand(3);
9759  SDLoc DL(Op);
9760  return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
9761}
9762
9763SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
9764                                           SelectionDAG &DAG) const {
9765  SDValue CCVal = Op->getOperand(0);
9766  SDValue TVal = Op->getOperand(1);
9767  SDValue FVal = Op->getOperand(2);
9768  SDLoc DL(Op);
9769
9770  EVT Ty = Op.getValueType();
9771  if (Ty == MVT::aarch64svcount) {
9772    TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
9773    FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
9774    SDValue Sel =
9775        DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
9776    return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
9777  }
9778
9779  if (Ty.isScalableVector()) {
9780    MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
9781    SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
9782    return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
9783  }
9784
9785  if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
9786    // FIXME: Ideally this would be the same as above using i1 types, however
9787    // for the moment we can't deal with fixed i1 vector types properly, so
9788    // instead extend the predicate to a result type sized integer vector.
9789    MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
9790    MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
9791    SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
9792    SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
9793    return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
9794  }
9795
9796  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
9797  // instruction.
9798  if (ISD::isOverflowIntrOpRes(CCVal)) {
9799    // Only lower legal XALUO ops.
9800    if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
9801      return SDValue();
9802
9803    AArch64CC::CondCode OFCC;
9804    SDValue Value, Overflow;
9805    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
9806    SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
9807
9808    return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
9809                       CCVal, Overflow);
9810  }
9811
9812  // Lower it the same way as we would lower a SELECT_CC node.
9813  ISD::CondCode CC;
9814  SDValue LHS, RHS;
9815  if (CCVal.getOpcode() == ISD::SETCC) {
9816    LHS = CCVal.getOperand(0);
9817    RHS = CCVal.getOperand(1);
9818    CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
9819  } else {
9820    LHS = CCVal;
9821    RHS = DAG.getConstant(0, DL, CCVal.getValueType());
9822    CC = ISD::SETNE;
9823  }
9824
  // If we are lowering an f16 and we do not have full FP16 support, convert it
  // to an f32 in order to use FCSELSrrr.
9827  if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
9828    TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
9829                                     DAG.getUNDEF(MVT::f32), TVal);
9830    FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
9831                                     DAG.getUNDEF(MVT::f32), FVal);
9832  }
9833
9834  SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
9835
9836  if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
9837    return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
9838  }
9839
9840  return Res;
9841}
9842
9843SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
9844                                              SelectionDAG &DAG) const {
  // Jump table entries are PC-relative offsets. No additional tweaking is
  // necessary here; just get the address of the jump table.
9847  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
9848
9849  CodeModel::Model CM = getTargetMachine().getCodeModel();
9850  if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
9851      !Subtarget->isTargetMachO())
9852    return getAddrLarge(JT, DAG);
9853  if (CM == CodeModel::Tiny)
9854    return getAddrTiny(JT, DAG);
9855  return getAddr(JT, DAG);
9856}
9857
9858SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
9859                                          SelectionDAG &DAG) const {
  // Jump table entries are PC-relative offsets. No additional tweaking is
  // necessary here; just get the address of the jump table.
9862  SDLoc DL(Op);
9863  SDValue JT = Op.getOperand(1);
9864  SDValue Entry = Op.getOperand(2);
9865  int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
9866
9867  auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
9868  AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
9869
9870  SDNode *Dest =
9871      DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
9872                         Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
9873  SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
9874  return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
9875}
9876
9877SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
9878                                                 SelectionDAG &DAG) const {
9879  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
9880  CodeModel::Model CM = getTargetMachine().getCodeModel();
9881  if (CM == CodeModel::Large) {
9882    // Use the GOT for the large code model on iOS.
9883    if (Subtarget->isTargetMachO()) {
9884      return getGOT(CP, DAG);
9885    }
9886    if (!getTargetMachine().isPositionIndependent())
9887      return getAddrLarge(CP, DAG);
9888  } else if (CM == CodeModel::Tiny) {
9889    return getAddrTiny(CP, DAG);
9890  }
9891  return getAddr(CP, DAG);
9892}
9893
9894SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
9895                                               SelectionDAG &DAG) const {
9896  BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
9897  CodeModel::Model CM = getTargetMachine().getCodeModel();
9898  if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
9899    if (!getTargetMachine().isPositionIndependent())
9900      return getAddrLarge(BA, DAG);
9901  } else if (CM == CodeModel::Tiny) {
9902    return getAddrTiny(BA, DAG);
9903  }
9904  return getAddr(BA, DAG);
9905}
9906
9907SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
9908                                                 SelectionDAG &DAG) const {
9909  AArch64FunctionInfo *FuncInfo =
9910      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
9911
9912  SDLoc DL(Op);
9913  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
9914                                 getPointerTy(DAG.getDataLayout()));
9915  FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
9916  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9917  return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
9918                      MachinePointerInfo(SV));
9919}
9920
9921SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
9922                                                  SelectionDAG &DAG) const {
9923  MachineFunction &MF = DAG.getMachineFunction();
9924  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9925
9926  SDLoc DL(Op);
9927  SDValue FR;
9928  if (Subtarget->isWindowsArm64EC()) {
9929    // With the Arm64EC ABI, we compute the address of the varargs save area
9930    // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
9931    // but calls from an entry thunk can pass in a different address.
9932    Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
9933    SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
9934    uint64_t StackOffset;
9935    if (FuncInfo->getVarArgsGPRSize() > 0)
9936      StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
9937    else
9938      StackOffset = FuncInfo->getVarArgsStackOffset();
9939    FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
9940                     DAG.getConstant(StackOffset, DL, MVT::i64));
9941  } else {
9942    FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
9943                               ? FuncInfo->getVarArgsGPRIndex()
9944                               : FuncInfo->getVarArgsStackIndex(),
9945                           getPointerTy(DAG.getDataLayout()));
9946  }
9947  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9948  return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
9949                      MachinePointerInfo(SV));
9950}
9951
9952SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
9953                                                  SelectionDAG &DAG) const {
9954  // The layout of the va_list struct is specified in the AArch64 Procedure Call
9955  // Standard, section B.3.
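  // A sketch of the layout being initialized below (AAPCS64 field names):
  //   struct va_list {
  //     void *__stack;   // offset 0
  //     void *__gr_top;  // offset 8  (4 on ILP32)
  //     void *__vr_top;  // offset 16 (8 on ILP32)
  //     int   __gr_offs; // offset 24 (12 on ILP32)
  //     int   __vr_offs; // offset 28 (16 on ILP32)
  //   };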
9956  MachineFunction &MF = DAG.getMachineFunction();
9957  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9958  unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
9959  auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
9960  auto PtrVT = getPointerTy(DAG.getDataLayout());
9961  SDLoc DL(Op);
9962
9963  SDValue Chain = Op.getOperand(0);
9964  SDValue VAList = Op.getOperand(1);
9965  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9966  SmallVector<SDValue, 4> MemOps;
9967
9968  // void *__stack at offset 0
9969  unsigned Offset = 0;
9970  SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
9971  Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
9972  MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
9973                                MachinePointerInfo(SV), Align(PtrSize)));
9974
9975  // void *__gr_top at offset 8 (4 on ILP32)
9976  Offset += PtrSize;
9977  int GPRSize = FuncInfo->getVarArgsGPRSize();
9978  if (GPRSize > 0) {
9979    SDValue GRTop, GRTopAddr;
9980
9981    GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
9982                            DAG.getConstant(Offset, DL, PtrVT));
9983
9984    GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
9985    GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
9986                        DAG.getConstant(GPRSize, DL, PtrVT));
9987    GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
9988
9989    MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
9990                                  MachinePointerInfo(SV, Offset),
9991                                  Align(PtrSize)));
9992  }
9993
9994  // void *__vr_top at offset 16 (8 on ILP32)
9995  Offset += PtrSize;
9996  int FPRSize = FuncInfo->getVarArgsFPRSize();
9997  if (FPRSize > 0) {
9998    SDValue VRTop, VRTopAddr;
9999    VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10000                            DAG.getConstant(Offset, DL, PtrVT));
10001
10002    VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
10003    VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
10004                        DAG.getConstant(FPRSize, DL, PtrVT));
10005    VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
10006
10007    MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
10008                                  MachinePointerInfo(SV, Offset),
10009                                  Align(PtrSize)));
10010  }
10011
10012  // int __gr_offs at offset 24 (12 on ILP32)
10013  Offset += PtrSize;
10014  SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10015                                   DAG.getConstant(Offset, DL, PtrVT));
10016  MemOps.push_back(
10017      DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
10018                   GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10019
10020  // int __vr_offs at offset 28 (16 on ILP32)
10021  Offset += 4;
10022  SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10023                                   DAG.getConstant(Offset, DL, PtrVT));
10024  MemOps.push_back(
10025      DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
10026                   VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10027
10028  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
10029}
10030
10031SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
10032                                            SelectionDAG &DAG) const {
10033  MachineFunction &MF = DAG.getMachineFunction();
10034
10035  if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
10036    return LowerWin64_VASTART(Op, DAG);
10037  else if (Subtarget->isTargetDarwin())
10038    return LowerDarwin_VASTART(Op, DAG);
10039  else
10040    return LowerAAPCS_VASTART(Op, DAG);
10041}
10042
10043SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
10044                                           SelectionDAG &DAG) const {
  // The AAPCS va_list contains three pointers and two ints (= 32 bytes, or 20
  // bytes on ILP32); Darwin and Windows use a single pointer.
10047  SDLoc DL(Op);
10048  unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10049  unsigned VaListSize =
10050      (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
10051          ? PtrSize
10052          : Subtarget->isTargetILP32() ? 20 : 32;
10053  const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
10054  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
10055
10056  return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
10057                       DAG.getConstant(VaListSize, DL, MVT::i32),
10058                       Align(PtrSize), false, false, false,
10059                       MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
10060}
10061
10062SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
10063  assert(Subtarget->isTargetDarwin() &&
10064         "automatic va_arg instruction only works on Darwin");
10065
10066  const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10067  EVT VT = Op.getValueType();
10068  SDLoc DL(Op);
10069  SDValue Chain = Op.getOperand(0);
10070  SDValue Addr = Op.getOperand(1);
10071  MaybeAlign Align(Op.getConstantOperandVal(3));
10072  unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
10073  auto PtrVT = getPointerTy(DAG.getDataLayout());
10074  auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10075  SDValue VAList =
10076      DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
10077  Chain = VAList.getValue(1);
10078  VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
10079
10080  if (VT.isScalableVector())
10081    report_fatal_error("Passing SVE types to variadic functions is "
10082                       "currently not supported");
10083
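  // Round VAList up to the requested alignment, e.g. for a 16-byte aligned
  // argument: VAList = (VAList + 15) & ~15.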
10084  if (Align && *Align > MinSlotSize) {
10085    VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10086                         DAG.getConstant(Align->value() - 1, DL, PtrVT));
10087    VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
10088                         DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
10089  }
10090
10091  Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
10092  unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
10093
10094  // Scalar integer and FP values smaller than 64 bits are implicitly extended
10095  // up to 64 bits.  At the very least, we have to increase the striding of the
10096  // vaargs list to match this, and for FP values we need to introduce
10097  // FP_ROUND nodes as well.
10098  if (VT.isInteger() && !VT.isVector())
10099    ArgSize = std::max(ArgSize, MinSlotSize);
10100  bool NeedFPTrunc = false;
10101  if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
10102    ArgSize = 8;
10103    NeedFPTrunc = true;
10104  }
10105
10106  // Increment the pointer, VAList, to the next vaarg
10107  SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10108                               DAG.getConstant(ArgSize, DL, PtrVT));
10109  VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
10110
10111  // Store the incremented VAList to the legalized pointer
10112  SDValue APStore =
10113      DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
10114
10115  // Load the actual argument out of the pointer VAList
10116  if (NeedFPTrunc) {
10117    // Load the value as an f64.
10118    SDValue WideFP =
10119        DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
10120    // Round the value down to an f32.
10121    SDValue NarrowFP =
10122        DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
10123                    DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
10124    SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
10125    // Merge the rounded value with the chain output of the load.
10126    return DAG.getMergeValues(Ops, DL);
10127  }
10128
10129  return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
10130}
10131
10132SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
10133                                              SelectionDAG &DAG) const {
10134  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10135  MFI.setFrameAddressIsTaken(true);
10136
10137  EVT VT = Op.getValueType();
10138  SDLoc DL(Op);
10139  unsigned Depth = Op.getConstantOperandVal(0);
10140  SDValue FrameAddr =
10141      DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
10142  while (Depth--)
10143    FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
10144                            MachinePointerInfo());
10145
10146  if (Subtarget->isTargetILP32())
10147    FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
10148                            DAG.getValueType(VT));
10149
10150  return FrameAddr;
10151}
10152
10153SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
10154                                              SelectionDAG &DAG) const {
10155  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10156
10157  EVT VT = getPointerTy(DAG.getDataLayout());
10158  SDLoc DL(Op);
10159  int FI = MFI.CreateFixedObject(4, 0, false);
10160  return DAG.getFrameIndex(FI, VT);
10161}
10162
10163#define GET_REGISTER_MATCHER
10164#include "AArch64GenAsmMatcher.inc"
10165
10166// FIXME? Maybe this could be a TableGen attribute on some registers and
10167// this table could be generated automatically from RegInfo.
10168Register AArch64TargetLowering::
10169getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
10170  Register Reg = MatchRegisterName(RegName);
10171  if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
10172    const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
10173    unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
10174    if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
10175        !MRI->isReservedReg(MF, Reg))
10176      Reg = 0;
10177  }
10178  if (Reg)
10179    return Reg;
  report_fatal_error(
      Twine("Invalid register name \"" + StringRef(RegName) + "\"."));
10182}
10183
10184SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
10185                                                     SelectionDAG &DAG) const {
10186  DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
10187
10188  EVT VT = Op.getValueType();
10189  SDLoc DL(Op);
10190
10191  SDValue FrameAddr =
10192      DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
10193  SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
10194
10195  return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
10196}
10197
10198SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
10199                                               SelectionDAG &DAG) const {
10200  MachineFunction &MF = DAG.getMachineFunction();
10201  MachineFrameInfo &MFI = MF.getFrameInfo();
10202  MFI.setReturnAddressIsTaken(true);
10203
10204  EVT VT = Op.getValueType();
10205  SDLoc DL(Op);
10206  unsigned Depth = Op.getConstantOperandVal(0);
10207  SDValue ReturnAddress;
10208  if (Depth) {
10209    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
10210    SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
10211    ReturnAddress = DAG.getLoad(
10212        VT, DL, DAG.getEntryNode(),
10213        DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
10214  } else {
10215    // Return LR, which contains the return address. Mark it an implicit
10216    // live-in.
10217    Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
10218    ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
10219  }
10220
  // The XPACLRI instruction assembles to a hint-space instruction before
  // Armv8.3-A, so it can safely be used on any pre-Armv8.3-A architecture. On
  // Armv8.3-A and onwards XPACI is available, so use that instead.
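  // When the function signs LR, the value read above may still carry a
  // pointer authentication code in its top bits; stripping it here gives
  // callers of __builtin_return_address a plain code address.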
10225  SDNode *St;
10226  if (Subtarget->hasPAuth()) {
10227    St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
10228  } else {
10229    // XPACLRI operates on LR therefore we must move the operand accordingly.
10230    SDValue Chain =
10231        DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
10232    St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
10233  }
10234  return SDValue(St, 0);
10235}
10236
/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
/// i32 values and take a 2 x i32 value to shift plus a shift amount.
10239SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
10240                                               SelectionDAG &DAG) const {
10241  SDValue Lo, Hi;
10242  expandShiftParts(Op.getNode(), Lo, Hi, DAG);
10243  return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
10244}
10245
10246bool AArch64TargetLowering::isOffsetFoldingLegal(
10247    const GlobalAddressSDNode *GA) const {
10248  // Offsets are folded in the DAG combine rather than here so that we can
10249  // intelligently choose an offset based on the uses.
10250  return false;
10251}
10252
10253bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
10254                                         bool OptForSize) const {
10255  bool IsLegal = false;
  // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
  // and for the 16-bit case when the target has full fp16 support.
10258  // FIXME: We should be able to handle f128 as well with a clever lowering.
10259  const APInt ImmInt = Imm.bitcastToAPInt();
10260  if (VT == MVT::f64)
10261    IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
10262  else if (VT == MVT::f32)
10263    IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
10264  else if (VT == MVT::f16 || VT == MVT::bf16)
10265    IsLegal =
10266        (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
10267        Imm.isPosZero();
10268
  // If we cannot materialize the value in the immediate field for fmov, check
  // whether the value can be encoded as the immediate operand of a logical
  // instruction. The immediate value will be created with either MOVZ, MOVN,
  // or ORR.
  // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
  //       generate that fmov.
10274  if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
    // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
    // however the mov+fmov sequence is always better because of the reduced
    // cache pressure. The timings are still the same if you consider
    // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
    // movw+movk is fused). So we limit the sequence to at most 2 instructions.
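    // For example (illustrative): 65536.0f (0x47800000) has no fmov encoding,
    // but expandMOVImm can build it with a single "movz w, #0x4780, lsl #16",
    // so it stays within the limit and is treated as legal.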
10280    SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
10281    AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
10282    unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
10283    IsLegal = Insn.size() <= Limit;
10284  }
10285
10286  LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
10287                    << " imm value: "; Imm.dump(););
10288  return IsLegal;
10289}
10290
10291//===----------------------------------------------------------------------===//
10292//                          AArch64 Optimization Hooks
10293//===----------------------------------------------------------------------===//
10294
10295static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
10296                           SDValue Operand, SelectionDAG &DAG,
10297                           int &ExtraSteps) {
10298  EVT VT = Operand.getValueType();
10299  if ((ST->hasNEON() &&
10300       (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
10301        VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
10302        VT == MVT::v4f32)) ||
10303      (ST->hasSVE() &&
10304       (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
10305    if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
10306      // For the reciprocal estimates, convergence is quadratic, so the number
10307      // of digits is doubled after each iteration.  In ARMv8, the accuracy of
10308      // the initial estimate is 2^-8.  Thus the number of extra steps to refine
10309      // the result for float (23 mantissa bits) is 2 and for double (52
10310      // mantissa bits) is 3.
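      // Concretely: two steps take ~8 correct bits to 16 and then 32 (enough
      // for float's 24 significant bits); three steps reach 64 (enough for
      // double's 53 significant bits).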
10311      ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;
10312
10313    return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
10314  }
10315
10316  return SDValue();
10317}
10318
10319SDValue
10320AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
10321                                        const DenormalMode &Mode) const {
10322  SDLoc DL(Op);
10323  EVT VT = Op.getValueType();
10324  EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
10325  SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
10326  return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
10327}
10328
10329SDValue
10330AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
10331                                                   SelectionDAG &DAG) const {
10332  return Op;
10333}
10334
10335SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
10336                                               SelectionDAG &DAG, int Enabled,
10337                                               int &ExtraSteps,
10338                                               bool &UseOneConst,
10339                                               bool Reciprocal) const {
10340  if (Enabled == ReciprocalEstimate::Enabled ||
10341      (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
10342    if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
10343                                       DAG, ExtraSteps)) {
10344      SDLoc DL(Operand);
10345      EVT VT = Operand.getValueType();
10346
10347      SDNodeFlags Flags;
10348      Flags.setAllowReassociation(true);
10349
10350      // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
10351      // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
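      // FRSQRTS(a, b) computes (3.0 - a * b) / 2.0, so each step below is
      // Estimate *= FRSQRTS(Operand, Estimate * Estimate).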
10352      for (int i = ExtraSteps; i > 0; --i) {
10353        SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
10354                                   Flags);
10355        Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
10356        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
10357      }
10358      if (!Reciprocal)
10359        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
10360
10361      ExtraSteps = 0;
10362      return Estimate;
10363    }
10364
10365  return SDValue();
10366}
10367
10368SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
10369                                                SelectionDAG &DAG, int Enabled,
10370                                                int &ExtraSteps) const {
10371  if (Enabled == ReciprocalEstimate::Enabled)
10372    if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
10373                                       DAG, ExtraSteps)) {
10374      SDLoc DL(Operand);
10375      EVT VT = Operand.getValueType();
10376
10377      SDNodeFlags Flags;
10378      Flags.setAllowReassociation(true);
10379
10380      // Newton reciprocal iteration: E * (2 - X * E)
10381      // AArch64 reciprocal iteration instruction: (2 - M * N)
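      // FRECPS(a, b) computes 2.0 - a * b, so each step below is
      // Estimate *= FRECPS(Operand, Estimate).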
10382      for (int i = ExtraSteps; i > 0; --i) {
10383        SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
10384                                   Estimate, Flags);
10385        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
10386      }
10387
10388      ExtraSteps = 0;
10389      return Estimate;
10390    }
10391
10392  return SDValue();
10393}
10394
10395//===----------------------------------------------------------------------===//
10396//                          AArch64 Inline Assembly Support
10397//===----------------------------------------------------------------------===//
10398
// Table of Constraints
// TODO: This is the current set of constraints supported by ARM for the
// compiler; not all of them may make sense.
10402//
10403// r - A general register
10404// w - An FP/SIMD register of some size in the range v0-v31
10405// x - An FP/SIMD register of some size in the range v0-v15
10406// I - Constant that can be used with an ADD instruction
10407// J - Constant that can be used with a SUB instruction
10408// K - Constant that can be used with a 32-bit logical instruction
10409// L - Constant that can be used with a 64-bit logical instruction
10410// M - Constant that can be used as a 32-bit MOV immediate
10411// N - Constant that can be used as a 64-bit MOV immediate
10412// Q - A memory reference with base register and no offset
10413// S - A symbolic address
10414// Y - Floating point constant zero
10415// Z - Integer constant zero
10416//
10417//   Note that general register operands will be output using their 64-bit x
10418// register name, whatever the size of the variable, unless the asm operand
10419// is prefixed by the %w modifier. Floating-point and SIMD register operands
10420// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
10421// %q modifier.
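//
// For example (illustrative), a use of some of these constraints from C:
//   asm("add %x0, %x1, %2" : "=r"(Out) : "r"(In), "I"(42));
// binds Out and In to general registers and requires 42 to be a valid ADD
// immediate.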
10422const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
  // At this point, we have to lower this constraint to something else, so we
  // lower it to an "r" or "w". However, by doing this we will force the result
  // to be in a register, while the X constraint is much more permissive.
10426  //
10427  // Although we are correct (we are free to emit anything, without
10428  // constraints), we might break use cases that would expect us to be more
10429  // efficient and emit something else.
10430  if (!Subtarget->hasFPARMv8())
10431    return "r";
10432
10433  if (ConstraintVT.isFloatingPoint())
10434    return "w";
10435
10436  if (ConstraintVT.isVector() &&
10437     (ConstraintVT.getSizeInBits() == 64 ||
10438      ConstraintVT.getSizeInBits() == 128))
10439    return "w";
10440
10441  return "r";
10442}
10443
10444enum class PredicateConstraint { Uph, Upl, Upa };
10445
10446static std::optional<PredicateConstraint>
10447parsePredicateConstraint(StringRef Constraint) {
10448  return StringSwitch<std::optional<PredicateConstraint>>(Constraint)
10449      .Case("Uph", PredicateConstraint::Uph)
10450      .Case("Upl", PredicateConstraint::Upl)
10451      .Case("Upa", PredicateConstraint::Upa)
10452      .Default(std::nullopt);
10453}
10454
10455static const TargetRegisterClass *
10456getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
10457  if (VT != MVT::aarch64svcount &&
10458      (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
10459    return nullptr;
10460
10461  switch (Constraint) {
10462  case PredicateConstraint::Uph:
10463    return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
10464                                     : &AArch64::PPR_p8to15RegClass;
10465  case PredicateConstraint::Upl:
10466    return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
10467                                     : &AArch64::PPR_3bRegClass;
10468  case PredicateConstraint::Upa:
10469    return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
10470                                     : &AArch64::PPRRegClass;
10471  }
10472
10473  llvm_unreachable("Missing PredicateConstraint!");
10474}
10475
10476enum class ReducedGprConstraint { Uci, Ucj };
10477
10478static std::optional<ReducedGprConstraint>
10479parseReducedGprConstraint(StringRef Constraint) {
10480  return StringSwitch<std::optional<ReducedGprConstraint>>(Constraint)
10481      .Case("Uci", ReducedGprConstraint::Uci)
10482      .Case("Ucj", ReducedGprConstraint::Ucj)
10483      .Default(std::nullopt);
10484}
10485
10486static const TargetRegisterClass *
10487getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
10488  if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
10489    return nullptr;
10490
10491  switch (Constraint) {
10492  case ReducedGprConstraint::Uci:
10493    return &AArch64::MatrixIndexGPR32_8_11RegClass;
10494  case ReducedGprConstraint::Ucj:
10495    return &AArch64::MatrixIndexGPR32_12_15RegClass;
10496  }
10497
10498  llvm_unreachable("Missing ReducedGprConstraint!");
10499}
10500
// The set of CC codes supported here is taken from
// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
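// For example (illustrative), a C-level flag output operand such as
//   asm("cmp %w1, %w2" : "=@cceq"(IsEq) : "r"(A), "r"(B));
// arrives here as the constraint string "{@cceq}".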
10503static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) {
10504  AArch64CC::CondCode Cond = StringSwitch<AArch64CC::CondCode>(Constraint)
10505                                 .Case("{@cchi}", AArch64CC::HI)
10506                                 .Case("{@cccs}", AArch64CC::HS)
10507                                 .Case("{@cclo}", AArch64CC::LO)
10508                                 .Case("{@ccls}", AArch64CC::LS)
10509                                 .Case("{@cccc}", AArch64CC::LO)
10510                                 .Case("{@cceq}", AArch64CC::EQ)
10511                                 .Case("{@ccgt}", AArch64CC::GT)
10512                                 .Case("{@ccge}", AArch64CC::GE)
10513                                 .Case("{@cclt}", AArch64CC::LT)
10514                                 .Case("{@ccle}", AArch64CC::LE)
10515                                 .Case("{@cchs}", AArch64CC::HS)
10516                                 .Case("{@ccne}", AArch64CC::NE)
10517                                 .Case("{@ccvc}", AArch64CC::VC)
10518                                 .Case("{@ccpl}", AArch64CC::PL)
10519                                 .Case("{@ccvs}", AArch64CC::VS)
10520                                 .Case("{@ccmi}", AArch64CC::MI)
10521                                 .Default(AArch64CC::Invalid);
10522  return Cond;
10523}
10524
10525/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
10526/// WZR, invert(<cond>)'.
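/// For example, "cset w0, eq" is the preferred alias for
/// "csinc w0, wzr, wzr, ne".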
10527static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
10528                        SelectionDAG &DAG) {
10529  return DAG.getNode(
10530      AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
10531      DAG.getConstant(0, DL, MVT::i32),
10532      DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV);
10533}
10534
10535// Lower @cc flag output via getSETCC.
10536SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
10537    SDValue &Chain, SDValue &Glue, const SDLoc &DL,
10538    const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
10539  AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
10540  if (Cond == AArch64CC::Invalid)
10541    return SDValue();
10542  // The output variable should be a scalar integer.
10543  if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
10544      OpInfo.ConstraintVT.getSizeInBits() < 8)
10545    report_fatal_error("Flag output operand is of invalid type");
10546
10547  // Get NZCV register. Only update chain when copyfrom is glued.
10548  if (Glue.getNode()) {
10549    Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
10550    Chain = Glue.getValue(1);
10551  } else
10552    Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
10553  // Extract CC code.
10554  SDValue CC = getSETCC(Cond, Glue, DL, DAG);
10555
10556  SDValue Result;
10557
10558  // Truncate or ZERO_EXTEND based on value types.
10559  if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
10560    Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
10561  else
10562    Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
10563
10564  return Result;
10565}
10566
10567/// getConstraintType - Given a constraint letter, return the type of
10568/// constraint it is for this target.
10569AArch64TargetLowering::ConstraintType
10570AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
10571  if (Constraint.size() == 1) {
10572    switch (Constraint[0]) {
10573    default:
10574      break;
10575    case 'x':
10576    case 'w':
10577    case 'y':
10578      return C_RegisterClass;
    // An address with a single base register. Due to the way we
    // currently handle addresses, it is the same as 'r'.
10581    case 'Q':
10582      return C_Memory;
10583    case 'I':
10584    case 'J':
10585    case 'K':
10586    case 'L':
10587    case 'M':
10588    case 'N':
10589    case 'Y':
10590    case 'Z':
10591      return C_Immediate;
10592    case 'z':
10593    case 'S': // A symbolic address
10594      return C_Other;
10595    }
10596  } else if (parsePredicateConstraint(Constraint))
10597    return C_RegisterClass;
10598  else if (parseReducedGprConstraint(Constraint))
10599    return C_RegisterClass;
10600  else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
10601    return C_Other;
10602  return TargetLowering::getConstraintType(Constraint);
10603}
10604
10605/// Examine constraint type and operand type and determine a weight value.
10606/// This object must already have been set up with the operand type
10607/// and the current alternative constraint selected.
10608TargetLowering::ConstraintWeight
10609AArch64TargetLowering::getSingleConstraintMatchWeight(
10610    AsmOperandInfo &info, const char *constraint) const {
10611  ConstraintWeight weight = CW_Invalid;
10612  Value *CallOperandVal = info.CallOperandVal;
10613  // If we don't have a value, we can't do a match,
10614  // but allow it at the lowest weight.
10615  if (!CallOperandVal)
10616    return CW_Default;
10617  Type *type = CallOperandVal->getType();
10618  // Look at the constraint type.
10619  switch (*constraint) {
10620  default:
10621    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
10622    break;
10623  case 'x':
10624  case 'w':
10625  case 'y':
10626    if (type->isFloatingPointTy() || type->isVectorTy())
10627      weight = CW_Register;
10628    break;
10629  case 'z':
10630    weight = CW_Constant;
10631    break;
10632  case 'U':
10633    if (parsePredicateConstraint(constraint) ||
10634        parseReducedGprConstraint(constraint))
10635      weight = CW_Register;
10636    break;
10637  }
10638  return weight;
10639}
10640
10641std::pair<unsigned, const TargetRegisterClass *>
10642AArch64TargetLowering::getRegForInlineAsmConstraint(
10643    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
10644  if (Constraint.size() == 1) {
10645    switch (Constraint[0]) {
10646    case 'r':
10647      if (VT.isScalableVector())
10648        return std::make_pair(0U, nullptr);
10649      if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
10650        return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
10651      if (VT.getFixedSizeInBits() == 64)
10652        return std::make_pair(0U, &AArch64::GPR64commonRegClass);
10653      return std::make_pair(0U, &AArch64::GPR32commonRegClass);
10654    case 'w': {
10655      if (!Subtarget->hasFPARMv8())
10656        break;
10657      if (VT.isScalableVector()) {
10658        if (VT.getVectorElementType() != MVT::i1)
10659          return std::make_pair(0U, &AArch64::ZPRRegClass);
10660        return std::make_pair(0U, nullptr);
10661      }
10662      uint64_t VTSize = VT.getFixedSizeInBits();
10663      if (VTSize == 16)
10664        return std::make_pair(0U, &AArch64::FPR16RegClass);
10665      if (VTSize == 32)
10666        return std::make_pair(0U, &AArch64::FPR32RegClass);
10667      if (VTSize == 64)
10668        return std::make_pair(0U, &AArch64::FPR64RegClass);
10669      if (VTSize == 128)
10670        return std::make_pair(0U, &AArch64::FPR128RegClass);
10671      break;
10672    }
    // The instructions that this constraint is designed for can
    // only take 128-bit registers, so just use that regclass.
10675    case 'x':
10676      if (!Subtarget->hasFPARMv8())
10677        break;
10678      if (VT.isScalableVector())
10679        return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
10680      if (VT.getSizeInBits() == 128)
10681        return std::make_pair(0U, &AArch64::FPR128_loRegClass);
10682      break;
10683    case 'y':
10684      if (!Subtarget->hasFPARMv8())
10685        break;
10686      if (VT.isScalableVector())
10687        return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
10688      break;
10689    }
10690  } else {
10691    if (const auto PC = parsePredicateConstraint(Constraint))
10692      if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
10693        return std::make_pair(0U, RegClass);
10694
10695    if (const auto RGC = parseReducedGprConstraint(Constraint))
10696      if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
10697        return std::make_pair(0U, RegClass);
10698  }
10699  if (StringRef("{cc}").equals_insensitive(Constraint) ||
10700      parseConstraintCode(Constraint) != AArch64CC::Invalid)
10701    return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
10702
10703  if (Constraint == "{za}") {
10704    return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
10705  }
10706
10707  if (Constraint == "{zt0}") {
10708    return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
10709  }
10710
10711  // Use the default implementation in TargetLowering to convert the register
10712  // constraint into a member of a register class.
10713  std::pair<unsigned, const TargetRegisterClass *> Res;
10714  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
10715
10716  // Not found as a standard register?
10717  if (!Res.second) {
10718    unsigned Size = Constraint.size();
10719    if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
10720        tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
10721      int RegNo;
10722      bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
10723      if (!Failed && RegNo >= 0 && RegNo <= 31) {
        // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
        // By default we'll emit v0-v31 for this, unless a modifier is given,
        // in which case the correct register form will be emitted.
10727        if (VT != MVT::Other && VT.getSizeInBits() == 64) {
10728          Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
10729          Res.second = &AArch64::FPR64RegClass;
10730        } else {
10731          Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
10732          Res.second = &AArch64::FPR128RegClass;
10733        }
10734      }
10735    }
10736  }
10737
10738  if (Res.second && !Subtarget->hasFPARMv8() &&
10739      !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
10740      !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
10741    return std::make_pair(0U, nullptr);
10742
10743  return Res;
10744}
10745
10746EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
10747                                                  llvm::Type *Ty,
10748                                                  bool AllowUnknown) const {
10749  if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
10750    return EVT(MVT::i64x8);
10751
10752  return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
10753}
10754
10755/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
10756/// vector.  If it is invalid, don't add anything to Ops.
10757void AArch64TargetLowering::LowerAsmOperandForConstraint(
10758    SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
10759    SelectionDAG &DAG) const {
10760  SDValue Result;
10761
10762  // Currently only support length 1 constraints.
10763  if (Constraint.size() != 1)
10764    return;
10765
10766  char ConstraintLetter = Constraint[0];
10767  switch (ConstraintLetter) {
10768  default:
10769    break;
10770
  // This set of constraints deals with valid constants for various
  // instructions. Validate and return a target constant for them if we can.
10773  case 'z': {
10774    // 'z' maps to xzr or wzr so it needs an input of 0.
10775    if (!isNullConstant(Op))
10776      return;
10777
10778    if (Op.getValueType() == MVT::i64)
10779      Result = DAG.getRegister(AArch64::XZR, MVT::i64);
10780    else
10781      Result = DAG.getRegister(AArch64::WZR, MVT::i32);
10782    break;
10783  }
10784  case 'S': {
10785    // An absolute symbolic address or label reference.
10786    if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
10787      Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
10788                                          GA->getValueType(0));
10789    } else if (const BlockAddressSDNode *BA =
10790                   dyn_cast<BlockAddressSDNode>(Op)) {
10791      Result =
10792          DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0));
10793    } else
10794      return;
10795    break;
10796  }
10797
10798  case 'I':
10799  case 'J':
10800  case 'K':
10801  case 'L':
10802  case 'M':
10803  case 'N':
10804    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
10805    if (!C)
10806      return;
10807
10808    // Grab the value and do some validation.
10809    uint64_t CVal = C->getZExtValue();
10810    switch (ConstraintLetter) {
10811    // The I constraint applies only to simple ADD or SUB immediate operands:
10812    // i.e. 0 to 4095 with optional shift by 12
10813    // The J constraint applies only to ADD or SUB immediates that would be
10814    // valid when negated, i.e. if [an add pattern] were to be output as a SUB
10815    // instruction [or vice versa], in other words -1 to -4095 with optional
10816    // left shift by 12.
10817    case 'I':
10818      if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
10819        break;
10820      return;
10821    case 'J': {
10822      uint64_t NVal = -C->getSExtValue();
10823      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
10824        CVal = C->getSExtValue();
10825        break;
10826      }
10827      return;
10828    }
10829    // The K and L constraints apply *only* to logical immediates, including
10830    // what used to be the MOVI alias for ORR (though the MOVI alias has now
10831    // been removed and MOV should be used). So these constraints have to
10832    // distinguish between bit patterns that are valid 32-bit or 64-bit
10833    // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
10834    // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
10835    // versa.
10836    case 'K':
10837      if (AArch64_AM::isLogicalImmediate(CVal, 32))
10838        break;
10839      return;
10840    case 'L':
10841      if (AArch64_AM::isLogicalImmediate(CVal, 64))
10842        break;
10843      return;
    // The M and N constraints are a superset of K and L respectively, for use
    // with the MOV (immediate) alias. As well as the logical immediates they
    // also match 32 or 64-bit immediates that can be loaded either using a
    // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
    // (M) or 64-bit 0x1234000000000000 (N) etc.
    // As a note, some of this code is liberally stolen from the asm parser.
10850    case 'M': {
10851      if (!isUInt<32>(CVal))
10852        return;
10853      if (AArch64_AM::isLogicalImmediate(CVal, 32))
10854        break;
10855      if ((CVal & 0xFFFF) == CVal)
10856        break;
10857      if ((CVal & 0xFFFF0000ULL) == CVal)
10858        break;
10859      uint64_t NCVal = ~(uint32_t)CVal;
10860      if ((NCVal & 0xFFFFULL) == NCVal)
10861        break;
10862      if ((NCVal & 0xFFFF0000ULL) == NCVal)
10863        break;
10864      return;
10865    }
10866    case 'N': {
10867      if (AArch64_AM::isLogicalImmediate(CVal, 64))
10868        break;
10869      if ((CVal & 0xFFFFULL) == CVal)
10870        break;
10871      if ((CVal & 0xFFFF0000ULL) == CVal)
10872        break;
10873      if ((CVal & 0xFFFF00000000ULL) == CVal)
10874        break;
10875      if ((CVal & 0xFFFF000000000000ULL) == CVal)
10876        break;
10877      uint64_t NCVal = ~CVal;
10878      if ((NCVal & 0xFFFFULL) == NCVal)
10879        break;
10880      if ((NCVal & 0xFFFF0000ULL) == NCVal)
10881        break;
10882      if ((NCVal & 0xFFFF00000000ULL) == NCVal)
10883        break;
10884      if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
10885        break;
10886      return;
10887    }
10888    default:
10889      return;
10890    }
10891
10892    // All assembler immediates are 64-bit integers.
10893    Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
10894    break;
10895  }
10896
10897  if (Result.getNode()) {
10898    Ops.push_back(Result);
10899    return;
10900  }
10901
10902  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
10903}
10904
10905//===----------------------------------------------------------------------===//
10906//                     AArch64 Advanced SIMD Support
10907//===----------------------------------------------------------------------===//
10908
10909/// WidenVector - Given a value in the V64 register class, produce the
10910/// equivalent value in the V128 register class.
10911static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
10912  EVT VT = V64Reg.getValueType();
10913  unsigned NarrowSize = VT.getVectorNumElements();
10914  MVT EltTy = VT.getVectorElementType().getSimpleVT();
10915  MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
10916  SDLoc DL(V64Reg);
10917
10918  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
10919                     V64Reg, DAG.getConstant(0, DL, MVT::i64));
10920}
10921
10922/// getExtFactor - Determine the adjustment factor for the position when
10923/// generating an "extract from vector registers" instruction.
10924static unsigned getExtFactor(SDValue &V) {
10925  EVT EltType = V.getValueType().getVectorElementType();
10926  return EltType.getSizeInBits() / 8;
10927}
10928
// Check if a vector is built by extracting elements from one vector at
// positions taken from another vector (optionally combined with an AND mask
// that keeps all indices in range). Such a build can be reconstructed using
// AND and NEON's TBL1.
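// For example (indices shown for v8i8):
//   BUILD_VECTOR(extract(V, and(extract(M, 0), 7)), ...,
//                extract(V, and(extract(M, 7), 7)))
// can be rebuilt as TBL1(V, AND(M, splat(7))).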
10932SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) {
10933  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
10934  SDLoc dl(Op);
10935  EVT VT = Op.getValueType();
10936  assert(!VT.isScalableVector() &&
10937         "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
10938
10939  // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
10940  // directly to TBL1.
10941  if (VT != MVT::v16i8 && VT != MVT::v8i8)
10942    return SDValue();
10943
10944  unsigned NumElts = VT.getVectorNumElements();
10945  assert((NumElts == 8 || NumElts == 16) &&
10946         "Need to have exactly 8 or 16 elements in vector.");
10947
10948  SDValue SourceVec;
10949  SDValue MaskSourceVec;
10950  SmallVector<SDValue, 16> AndMaskConstants;
10951
10952  for (unsigned i = 0; i < NumElts; ++i) {
10953    SDValue V = Op.getOperand(i);
10954    if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10955      return SDValue();
10956
10957    SDValue OperandSourceVec = V.getOperand(0);
10958    if (!SourceVec)
10959      SourceVec = OperandSourceVec;
10960    else if (SourceVec != OperandSourceVec)
10961      return SDValue();
10962
10963    // This only looks at shuffles with elements that are
10964    // a) truncated by a constant AND mask extracted from a mask vector, or
10965    // b) extracted directly from a mask vector.
10966    SDValue MaskSource = V.getOperand(1);
10967    if (MaskSource.getOpcode() == ISD::AND) {
10968      if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
10969        return SDValue();
10970
10971      AndMaskConstants.push_back(MaskSource.getOperand(1));
10972      MaskSource = MaskSource->getOperand(0);
10973    } else if (!AndMaskConstants.empty()) {
10974      // Either all or no operands should have an AND mask.
10975      return SDValue();
10976    }
10977
10978    // An ANY_EXTEND may be inserted between the AND and the source vector
10979    // extraction. We don't care about that, so we can just skip it.
10980    if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
10981      MaskSource = MaskSource.getOperand(0);
10982
10983    if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10984      return SDValue();
10985
10986    SDValue MaskIdx = MaskSource.getOperand(1);
10987    if (!isa<ConstantSDNode>(MaskIdx) ||
10988        !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
10989      return SDValue();
10990
10991    // We only apply this if all elements come from the same vector with the
10992    // same vector type.
10993    if (!MaskSourceVec) {
10994      MaskSourceVec = MaskSource->getOperand(0);
10995      if (MaskSourceVec.getValueType() != VT)
10996        return SDValue();
10997    } else if (MaskSourceVec != MaskSource->getOperand(0)) {
10998      return SDValue();
10999    }
11000  }
11001
11002  // We need a v16i8 for TBL, so we extend the source with a placeholder vector
11003  // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
11004  // insert, we know that the index in the mask must be smaller than the number
11005  // of elements in the source, or we would have an out-of-bounds access.
11006  if (NumElts == 8)
11007    SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec,
11008                            DAG.getUNDEF(VT));
11009
11010  // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
11011  if (!AndMaskConstants.empty())
11012    MaskSourceVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec,
11013                                DAG.getBuildVector(VT, dl, AndMaskConstants));
11014
11015  return DAG.getNode(
11016      ISD::INTRINSIC_WO_CHAIN, dl, VT,
11017      DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
11018      MaskSourceVec);
11019}
11020
11021// Gather data to see if the operation can be modelled as a
11022// shuffle in combination with VEXTs.
11023SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
11024                                                  SelectionDAG &DAG) const {
11025  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11026  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
11027  SDLoc dl(Op);
11028  EVT VT = Op.getValueType();
11029  assert(!VT.isScalableVector() &&
11030         "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11031  unsigned NumElts = VT.getVectorNumElements();
11032
11033  struct ShuffleSourceInfo {
11034    SDValue Vec;
11035    unsigned MinElt;
11036    unsigned MaxElt;
11037
11038    // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
11039    // be compatible with the shuffle we intend to construct. As a result
11040    // ShuffleVec will be some sliding window into the original Vec.
11041    SDValue ShuffleVec;
11042
    // Code should guarantee that element i in Vec starts at element
    // "WindowBase + i * WindowScale" in ShuffleVec.
11045    int WindowBase;
11046    int WindowScale;
11047
11048    ShuffleSourceInfo(SDValue Vec)
11049      : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
11050          ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
11051
11052    bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
11053  };
11054
11055  // First gather all vectors used as an immediate source for this BUILD_VECTOR
11056  // node.
11057  SmallVector<ShuffleSourceInfo, 2> Sources;
11058  for (unsigned i = 0; i < NumElts; ++i) {
11059    SDValue V = Op.getOperand(i);
11060    if (V.isUndef())
11061      continue;
11062    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11063             !isa<ConstantSDNode>(V.getOperand(1)) ||
11064             V.getOperand(0).getValueType().isScalableVector()) {
11065      LLVM_DEBUG(
11066          dbgs() << "Reshuffle failed: "
11067                    "a shuffle can only come from building a vector from "
11068                    "various elements of other fixed-width vectors, provided "
11069                    "their indices are constant\n");
11070      return SDValue();
11071    }
11072
11073    // Add this element source to the list if it's not already there.
11074    SDValue SourceVec = V.getOperand(0);
11075    auto Source = find(Sources, SourceVec);
11076    if (Source == Sources.end())
11077      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
11078
11079    // Update the minimum and maximum lane number seen.
11080    unsigned EltNo = V.getConstantOperandVal(1);
11081    Source->MinElt = std::min(Source->MinElt, EltNo);
11082    Source->MaxElt = std::max(Source->MaxElt, EltNo);
11083  }
11084
11085  // If we have 3 or 4 sources, try to generate a TBL, which will at least be
11086  // better than moving to/from gpr registers for larger vectors.
11087  if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
11088    // Construct a mask for the tbl. We may need to adjust the index for types
11089    // larger than i8.
11090    SmallVector<unsigned, 16> Mask;
11091    unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
11092    for (unsigned I = 0; I < NumElts; ++I) {
11093      SDValue V = Op.getOperand(I);
11094      if (V.isUndef()) {
11095        for (unsigned OF = 0; OF < OutputFactor; OF++)
11096          Mask.push_back(-1);
11097        continue;
11098      }
11099      // Set the Mask lanes adjusted for the size of the input and output
11100      // lanes. The Mask is always i8, so it will set OutputFactor lanes per
11101      // output element, adjusted in their positions per input and output types.
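      // For example, an extract of lane 2 from a v8i16 source S (16-bit input
      // lanes) yields InputBase = 16 * S + 4, i.e. bytes 4 and 5 of that
      // source within the concatenated TBL sources.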
11102      unsigned Lane = V.getConstantOperandVal(1);
11103      for (unsigned S = 0; S < Sources.size(); S++) {
11104        if (V.getOperand(0) == Sources[S].Vec) {
11105          unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
11106          unsigned InputBase = 16 * S + Lane * InputSize / 8;
11107          for (unsigned OF = 0; OF < OutputFactor; OF++)
11108            Mask.push_back(InputBase + OF);
11109          break;
11110        }
11111      }
11112    }
11113
11114    // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
11115    // v16i8, and the TBLMask
11116    SmallVector<SDValue, 16> TBLOperands;
11117    TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
11118                                              ? Intrinsic::aarch64_neon_tbl3
11119                                              : Intrinsic::aarch64_neon_tbl4,
11120                                          dl, MVT::i32));
11121    for (unsigned i = 0; i < Sources.size(); i++) {
11122      SDValue Src = Sources[i].Vec;
11123      EVT SrcVT = Src.getValueType();
11124      Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
11125      assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
11126             "Expected a legally typed vector");
11127      if (SrcVT.is64BitVector())
11128        Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
11129                          DAG.getUNDEF(MVT::v8i8));
11130      TBLOperands.push_back(Src);
11131    }
11132
11133    SmallVector<SDValue, 16> TBLMask;
11134    for (unsigned i = 0; i < Mask.size(); i++)
11135      TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
11136    assert((Mask.size() == 8 || Mask.size() == 16) &&
11137           "Expected a v8i8 or v16i8 Mask");
11138    TBLOperands.push_back(
11139        DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
11140
11141    SDValue Shuffle =
11142        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
11143                    Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
11144    return DAG.getBitcast(VT, Shuffle);
11145  }
11146
11147  if (Sources.size() > 2) {
11148    LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
11149                      << "sensible when at most two source vectors are "
11150                      << "involved\n");
11151    return SDValue();
11152  }
11153
  // Find out the smallest element size among the result and the two sources,
  // and use it as the element size to build the shuffle_vector.
11156  EVT SmallestEltTy = VT.getVectorElementType();
11157  for (auto &Source : Sources) {
11158    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
11159    if (SrcEltTy.bitsLT(SmallestEltTy)) {
11160      SmallestEltTy = SrcEltTy;
11161    }
11162  }
11163  unsigned ResMultiplier =
11164      VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11165  uint64_t VTSize = VT.getFixedSizeInBits();
11166  NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
11167  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
11168
11169  // If the source vector is too wide or too narrow, we may nevertheless be able
11170  // to construct a compatible shuffle either by concatenating it with UNDEF or
11171  // extracting a suitable range of elements.
11172  for (auto &Src : Sources) {
11173    EVT SrcVT = Src.ShuffleVec.getValueType();
11174
11175    TypeSize SrcVTSize = SrcVT.getSizeInBits();
11176    if (SrcVTSize == TypeSize::getFixed(VTSize))
11177      continue;
11178
11179    // This stage of the search produces a source with the same element type as
11180    // the original, but with a total width matching the BUILD_VECTOR output.
11181    EVT EltVT = SrcVT.getVectorElementType();
11182    unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
11183    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
11184
11185    if (SrcVTSize.getFixedValue() < VTSize) {
11186      assert(2 * SrcVTSize == VTSize);
11187      // We can pad out the smaller vector for free, so if it's part of a
11188      // shuffle...
11189      Src.ShuffleVec =
11190          DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
11191                      DAG.getUNDEF(Src.ShuffleVec.getValueType()));
11192      continue;
11193    }
11194
11195    if (SrcVTSize.getFixedValue() != 2 * VTSize) {
11196      LLVM_DEBUG(
11197          dbgs() << "Reshuffle failed: result vector too small to extract\n");
11198      return SDValue();
11199    }
11200
11201    if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
11202      LLVM_DEBUG(
11203          dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
11204      return SDValue();
11205    }
11206
11207    if (Src.MinElt >= NumSrcElts) {
11208      // The extraction can just take the second half
11209      Src.ShuffleVec =
11210          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11211                      DAG.getConstant(NumSrcElts, dl, MVT::i64));
11212      Src.WindowBase = -NumSrcElts;
11213    } else if (Src.MaxElt < NumSrcElts) {
11214      // The extraction can just take the first half
11215      Src.ShuffleVec =
11216          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11217                      DAG.getConstant(0, dl, MVT::i64));
11218    } else {
11219      // An actual VEXT is needed
11220      SDValue VEXTSrc1 =
11221          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11222                      DAG.getConstant(0, dl, MVT::i64));
11223      SDValue VEXTSrc2 =
11224          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11225                      DAG.getConstant(NumSrcElts, dl, MVT::i64));
11226      unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
11227
11228      if (!SrcVT.is64BitVector()) {
11229        LLVM_DEBUG(
11230          dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
11231                    "for SVE vectors.");
11232        return SDValue();
11233      }
11234
11235      Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
11236                                   VEXTSrc2,
11237                                   DAG.getConstant(Imm, dl, MVT::i32));
11238      Src.WindowBase = -Src.MinElt;
11239    }
11240  }
11241
11242  // Another possible incompatibility occurs from the vector element types. We
11243  // can fix this by bitcasting the source vectors to the same type we intend
11244  // for the shuffle.
11245  for (auto &Src : Sources) {
11246    EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
11247    if (SrcEltTy == SmallestEltTy)
11248      continue;
11249    assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
11250    if (DAG.getDataLayout().isBigEndian()) {
11251      Src.ShuffleVec =
11252          DAG.getNode(AArch64ISD::NVCAST, dl, ShuffleVT, Src.ShuffleVec);
11253    } else {
11254      Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
11255    }
11256    Src.WindowScale =
11257        SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11258    Src.WindowBase *= Src.WindowScale;
11259  }
11260
11261  // Final check before we try to actually produce a shuffle.
11262  LLVM_DEBUG(for (auto Src
11263                  : Sources)
11264                 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
11265
11266  // The stars all align, our next step is to produce the mask for the shuffle.
11267  SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
11268  int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
11269  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
11270    SDValue Entry = Op.getOperand(i);
11271    if (Entry.isUndef())
11272      continue;
11273
11274    auto Src = find(Sources, Entry.getOperand(0));
11275    int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
11276
11277    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
11278    // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
11279    // segment.
11280    EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
11281    int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
11282                               VT.getScalarSizeInBits());
11283    int LanesDefined = BitsDefined / BitsPerShuffleLane;
11284
11285    // This source is expected to fill ResMultiplier lanes of the final shuffle,
11286    // starting at the appropriate offset.
11287    int *LaneMask = &Mask[i * ResMultiplier];
11288
11289    int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
11290    ExtractBase += NumElts * (Src - Sources.begin());
11291    for (int j = 0; j < LanesDefined; ++j)
11292      LaneMask[j] = ExtractBase + j;
11293  }
11294
11295  // Final check before we try to produce nonsense...
11296  if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
11297    LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
11298    return SDValue();
11299  }
11300
11301  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
11302  for (unsigned i = 0; i < Sources.size(); ++i)
11303    ShuffleOps[i] = Sources[i].ShuffleVec;
11304
11305  SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
11306                                         ShuffleOps[1], Mask);
11307  SDValue V;
11308  if (DAG.getDataLayout().isBigEndian()) {
11309    V = DAG.getNode(AArch64ISD::NVCAST, dl, VT, Shuffle);
11310  } else {
11311    V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
11312  }
11313
11314  LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
11315             dbgs() << "Reshuffle, creating node: "; V.dump(););
11316
11317  return V;
11318}
11319
// Check if an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are the same.
11322static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
11323  unsigned NumElts = VT.getVectorNumElements();
11324
11325  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
11326  if (M[0] < 0)
11327    return false;
11328
11329  Imm = M[0];
11330
11331  // If this is a VEXT shuffle, the immediate value is the index of the first
11332  // element.  The other shuffle indices must be the successive elements after
11333  // the first one.
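  // For example, for v8i8 the mask <3, 4, 5, 6, 7, 0, 1, 2> is a
  // single-source EXT with Imm == 3.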
11334  unsigned ExpectedElt = Imm;
11335  for (unsigned i = 1; i < NumElts; ++i) {
11336    // Increment the expected index.  If it wraps around, just follow it
11337    // back to index zero and keep going.
11338    ++ExpectedElt;
11339    if (ExpectedElt == NumElts)
11340      ExpectedElt = 0;
11341
11342    if (M[i] < 0)
11343      continue; // ignore UNDEF indices
11344    if (ExpectedElt != static_cast<unsigned>(M[i]))
11345      return false;
11346  }
11347
11348  return true;
11349}
11350
11351// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
11352// v4i32s. This is really a truncate, which we can construct out of (legal)
11353// concats and truncate nodes.
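// For example, with four v4i32 sources A, B, C and D, the build vector
// <A0..A3, B0..B3, C0..C3, D0..D3> becomes concat(trunc(A), trunc(B)) and
// concat(trunc(C), trunc(D)) as v8i16s, each truncated to v8i8 and
// concatenated into the final v16i8.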
11354static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
11355  if (V.getValueType() != MVT::v16i8)
11356    return SDValue();
11357  assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
11358
11359  for (unsigned X = 0; X < 4; X++) {
11360    // Check the first item in each group is an extract from lane 0 of a v4i32
11361    // or v4i16.
11362    SDValue BaseExt = V.getOperand(X * 4);
11363    if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11364        (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
11365         BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
11366        !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
11367        BaseExt.getConstantOperandVal(1) != 0)
11368      return SDValue();
11369    SDValue Base = BaseExt.getOperand(0);
11370    // And check the other items are extracts from the same vector.
11371    for (unsigned Y = 1; Y < 4; Y++) {
11372      SDValue Ext = V.getOperand(X * 4 + Y);
11373      if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11374          Ext.getOperand(0) != Base ||
11375          !isa<ConstantSDNode>(Ext.getOperand(1)) ||
11376          Ext.getConstantOperandVal(1) != Y)
11377        return SDValue();
11378    }
11379  }
11380
  // Turn the buildvector into a series of truncates and concats, which will
  // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
  // concatenated together to produce 2 v8i16s. These are both truncated and
  // concatenated together.
11385  SDLoc DL(V);
11386  SDValue Trunc[4] = {
11387      V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
11388      V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
11389  for (SDValue &V : Trunc)
11390    if (V.getValueType() == MVT::v4i32)
11391      V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
11392  SDValue Concat0 =
11393      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
11394  SDValue Concat1 =
11395      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
11396  SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
11397  SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
11398  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
11399}
11400
/// Check if a vector shuffle corresponds to a DUP instruction with a larger
/// element width than the vector lane type. If that is the case, the function
/// returns true and writes the value of the DUP instruction's lane operand
/// into DupLaneOp.
11405static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
11406                          unsigned &DupLaneOp) {
11407  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
11408         "Only possible block sizes for wide DUP are: 16, 32, 64");
11409
11410  if (BlockSize <= VT.getScalarSizeInBits())
11411    return false;
11412  if (BlockSize % VT.getScalarSizeInBits() != 0)
11413    return false;
11414  if (VT.getSizeInBits() % BlockSize != 0)
11415    return false;
11416
11417  size_t SingleVecNumElements = VT.getVectorNumElements();
11418  size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
11419  size_t NumBlocks = VT.getSizeInBits() / BlockSize;
11420
  // We are looking for masks like
  // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
  // might be replaced by 'undefined'. BlockElts will eventually contain the
  // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
  // for the above examples).
11426  SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
11427  for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
11428    for (size_t I = 0; I < NumEltsPerBlock; I++) {
11429      int Elt = M[BlockIndex * NumEltsPerBlock + I];
11430      if (Elt < 0)
11431        continue;
11432      // For now we don't support shuffles that use the second operand
11433      if ((unsigned)Elt >= SingleVecNumElements)
11434        return false;
11435      if (BlockElts[I] < 0)
11436        BlockElts[I] = Elt;
11437      else if (BlockElts[I] != Elt)
11438        return false;
11439    }
11440
11441  // We found a candidate block (possibly with some undefs). It must be a
11442  // sequence of consecutive integers starting with a value divisible by
11443  // NumEltsPerBlock with some values possibly replaced by undef-s.
11444
11445  // Find first non-undef element
11446  auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
11447  assert(FirstRealEltIter != BlockElts.end() &&
11448         "Shuffle with all-undefs must have been caught by previous cases, "
11449         "e.g. isSplat()");
11450  if (FirstRealEltIter == BlockElts.end()) {
11451    DupLaneOp = 0;
11452    return true;
11453  }
11454
11455  // Index of FirstRealElt in BlockElts
11456  size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
11457
11458  if ((unsigned)*FirstRealEltIter < FirstRealIndex)
11459    return false;
11460  // BlockElts[0] must have the following value if it isn't undef:
11461  size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
11462
11463  // Check the first element
11464  if (Elt0 % NumEltsPerBlock != 0)
11465    return false;
11466  // Check that the sequence indeed consists of consecutive integers (modulo
11467  // undefs)
11468  for (size_t I = 0; I < NumEltsPerBlock; I++)
11469    if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
11470      return false;
11471
11472  DupLaneOp = Elt0 / NumEltsPerBlock;
11473  return true;
11474}
11475
// Check if an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are different.
11478static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
11479                      unsigned &Imm) {
11480  // Look for the first non-undef element.
11481  const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
11482
  // Benefit from APInt to handle overflow when calculating the expected
  // element.
11484  unsigned NumElts = VT.getVectorNumElements();
11485  unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
11486  APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
11487  // The following shuffle indices must be the successive elements after the
11488  // first real element.
11489  bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
11490    return Elt != ExpectedElt++ && Elt != -1;
11491  });
11492  if (FoundWrongElt)
11493    return false;
11494
11495  // The index of an EXT is the first element if it is not UNDEF.
11496  // Watch out for the beginning UNDEFs. The EXT index should be the expected
11497  // value of the first element.  E.g.
11498  // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
11499  // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
11500  // ExpectedElt is the last mask index plus 1.
11501  Imm = ExpectedElt.getZExtValue();
11502
  // There are two different cases that require reversing the input vectors.
  // For example, for vector <4 x i32> we have the following cases:
  // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
  // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
  // For both cases, we finally use mask <5, 6, 7, 0>, which requires
  // reversing the two input vectors.
11509  if (Imm < NumElts)
11510    ReverseEXT = true;
11511  else
11512    Imm -= NumElts;
11513
11514  return true;
11515}
11516
11517/// isREVMask - Check if a vector shuffle corresponds to a REV
11518/// instruction with the specified blocksize.  (The order of the elements
11519/// within each block of the vector is reversed.)
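/// For example, with BlockSize 64 a REV of <8 x i16> uses the mask
/// <3, 2, 1, 0, 7, 6, 5, 4>: each 64-bit block holds BlockElts = 4 elements
/// whose order is reversed.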
11520static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
11521  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64 ||
11522          BlockSize == 128) &&
11523         "Only possible block sizes for REV are: 16, 32, 64, 128");
11524
11525  unsigned EltSz = VT.getScalarSizeInBits();
11526  unsigned NumElts = VT.getVectorNumElements();
11527  unsigned BlockElts = M[0] + 1;
11528  // If the first shuffle index is UNDEF, be optimistic.
11529  if (M[0] < 0)
11530    BlockElts = BlockSize / EltSz;
11531
11532  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
11533    return false;
11534
11535  for (unsigned i = 0; i < NumElts; ++i) {
11536    if (M[i] < 0)
11537      continue; // ignore UNDEF indices
11538    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
11539      return false;
11540  }
11541
11542  return true;
11543}
11544
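/// isZIPMask - Check if a vector shuffle corresponds to a ZIP1/ZIP2
/// instruction. E.g. for <4 x i32> the ZIP1 mask is <0, 4, 1, 5>
/// (WhichResult = 0) and the ZIP2 mask is <2, 6, 3, 7> (WhichResult = 1),
/// possibly with some elements replaced by undef.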
11545static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11546  unsigned NumElts = VT.getVectorNumElements();
11547  if (NumElts % 2 != 0)
11548    return false;
11549  WhichResult = (M[0] == 0 ? 0 : 1);
11550  unsigned Idx = WhichResult * NumElts / 2;
11551  for (unsigned i = 0; i != NumElts; i += 2) {
11552    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
11553        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
11554      return false;
11555    Idx += 1;
11556  }
11557
11558  return true;
11559}
11560
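/// isUZPMask - Check if a vector shuffle corresponds to a UZP1/UZP2
/// instruction. E.g. for <4 x i32> the UZP1 mask is <0, 2, 4, 6>
/// (WhichResult = 0) and the UZP2 mask is <1, 3, 5, 7> (WhichResult = 1),
/// possibly with some elements replaced by undef.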
11561static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11562  unsigned NumElts = VT.getVectorNumElements();
11563  WhichResult = (M[0] == 0 ? 0 : 1);
11564  for (unsigned i = 0; i != NumElts; ++i) {
11565    if (M[i] < 0)
11566      continue; // ignore UNDEF indices
11567    if ((unsigned)M[i] != 2 * i + WhichResult)
11568      return false;
11569  }
11570
11571  return true;
11572}
11573
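/// isTRNMask - Check if a vector shuffle corresponds to a TRN1/TRN2
/// instruction. E.g. for <4 x i32> the TRN1 mask is <0, 4, 2, 6>
/// (WhichResult = 0) and the TRN2 mask is <1, 5, 3, 7> (WhichResult = 1),
/// possibly with some elements replaced by undef.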
11574static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11575  unsigned NumElts = VT.getVectorNumElements();
11576  if (NumElts % 2 != 0)
11577    return false;
11578  WhichResult = (M[0] == 0 ? 0 : 1);
11579  for (unsigned i = 0; i < NumElts; i += 2) {
11580    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
11581        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
11582      return false;
11583  }
11584  return true;
11585}
11586
11587/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
11588/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11589/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
11590static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11591  unsigned NumElts = VT.getVectorNumElements();
11592  if (NumElts % 2 != 0)
11593    return false;
11594  WhichResult = (M[0] == 0 ? 0 : 1);
11595  unsigned Idx = WhichResult * NumElts / 2;
11596  for (unsigned i = 0; i != NumElts; i += 2) {
11597    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
11598        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
11599      return false;
11600    Idx += 1;
11601  }
11602
11603  return true;
11604}
11605
11606/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
11607/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
11609static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11610  unsigned Half = VT.getVectorNumElements() / 2;
11611  WhichResult = (M[0] == 0 ? 0 : 1);
11612  for (unsigned j = 0; j != 2; ++j) {
11613    unsigned Idx = WhichResult;
11614    for (unsigned i = 0; i != Half; ++i) {
11615      int MIdx = M[i + j * Half];
11616      if (MIdx >= 0 && (unsigned)MIdx != Idx)
11617        return false;
11618      Idx += 2;
11619    }
11620  }
11621
11622  return true;
11623}
11624
11625/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
11626/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11627/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
11628static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11629  unsigned NumElts = VT.getVectorNumElements();
11630  if (NumElts % 2 != 0)
11631    return false;
11632  WhichResult = (M[0] == 0 ? 0 : 1);
11633  for (unsigned i = 0; i < NumElts; i += 2) {
11634    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
11635        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
11636      return false;
11637  }
11638  return true;
11639}
11640
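/// isINSMask - Check if a shuffle mask matches an INS: every element either
/// matches the identity of one source or is undef, except for a single
/// "anomaly" lane taken from elsewhere. E.g. for 4-element inputs the mask
/// <0, 1, 6, 3> matches with DstIsLeft = true and Anomaly = 2 (lane 2 of the
/// result comes from the second source).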
11641static bool isINSMask(ArrayRef<int> M, int NumInputElements,
11642                      bool &DstIsLeft, int &Anomaly) {
11643  if (M.size() != static_cast<size_t>(NumInputElements))
11644    return false;
11645
11646  int NumLHSMatch = 0, NumRHSMatch = 0;
11647  int LastLHSMismatch = -1, LastRHSMismatch = -1;
11648
11649  for (int i = 0; i < NumInputElements; ++i) {
11650    if (M[i] == -1) {
11651      ++NumLHSMatch;
11652      ++NumRHSMatch;
11653      continue;
11654    }
11655
11656    if (M[i] == i)
11657      ++NumLHSMatch;
11658    else
11659      LastLHSMismatch = i;
11660
11661    if (M[i] == i + NumInputElements)
11662      ++NumRHSMatch;
11663    else
11664      LastRHSMismatch = i;
11665  }
11666
11667  if (NumLHSMatch == NumInputElements - 1) {
11668    DstIsLeft = true;
11669    Anomaly = LastLHSMismatch;
11670    return true;
11671  } else if (NumRHSMatch == NumInputElements - 1) {
11672    DstIsLeft = false;
11673    Anomaly = LastRHSMismatch;
11674    return true;
11675  }
11676
11677  return false;
11678}
11679
11680static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
11681  if (VT.getSizeInBits() != 128)
11682    return false;
11683
11684  unsigned NumElts = VT.getVectorNumElements();
11685
11686  for (int I = 0, E = NumElts / 2; I != E; I++) {
11687    if (Mask[I] != I)
11688      return false;
11689  }
11690
11691  int Offset = NumElts / 2;
11692  for (int I = NumElts / 2, E = NumElts; I != E; I++) {
11693    if (Mask[I] != I + SplitLHS * Offset)
11694      return false;
11695  }
11696
11697  return true;
11698}
11699
11700static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
11701  SDLoc DL(Op);
11702  EVT VT = Op.getValueType();
11703  SDValue V0 = Op.getOperand(0);
11704  SDValue V1 = Op.getOperand(1);
11705  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
11706
11707  if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
11708      VT.getVectorElementType() != V1.getValueType().getVectorElementType())
11709    return SDValue();
11710
11711  bool SplitV0 = V0.getValueSizeInBits() == 128;
11712
11713  if (!isConcatMask(Mask, VT, SplitV0))
11714    return SDValue();
11715
11716  EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
11717  if (SplitV0) {
11718    V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
11719                     DAG.getConstant(0, DL, MVT::i64));
11720  }
11721  if (V1.getValueSizeInBits() == 128) {
11722    V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
11723                     DAG.getConstant(0, DL, MVT::i64));
11724  }
11725  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
11726}
11727
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle. ID is the perfect-shuffle
/// ID, V1 and V2 are the original shuffle inputs, PFEntry is the
/// perfect-shuffle table entry, and LHS/RHS are the immediate inputs for this
/// stage of the shuffle.
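/// Each table entry packs the operation into bits [29:26] and two 13-bit
/// sub-shuffle IDs into bits [25:13] and [12:0]. A sub-shuffle ID is a base-9
/// encoding of its four lanes (digit 8 meaning undef), which is why the
/// OP_COPY case below compares against (1*9+2)*9+3 and ((4*9+5)*9+6)*9+7, the
/// identity masks <0,1,2,3> and <4,5,6,7>.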
11733static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
11734                                      SDValue V2, unsigned PFEntry, SDValue LHS,
11735                                      SDValue RHS, SelectionDAG &DAG,
11736                                      const SDLoc &dl) {
11737  unsigned OpNum = (PFEntry >> 26) & 0x0F;
11738  unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
11739  unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
11740
11741  enum {
11742    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
11743    OP_VREV,
11744    OP_VDUP0,
11745    OP_VDUP1,
11746    OP_VDUP2,
11747    OP_VDUP3,
11748    OP_VEXT1,
11749    OP_VEXT2,
11750    OP_VEXT3,
11751    OP_VUZPL,  // VUZP, left result
11752    OP_VUZPR,  // VUZP, right result
11753    OP_VZIPL,  // VZIP, left result
11754    OP_VZIPR,  // VZIP, right result
11755    OP_VTRNL,  // VTRN, left result
11756    OP_VTRNR,  // VTRN, right result
11757    OP_MOVLANE // Move lane. RHSID is the lane to move into
11758  };
11759
11760  if (OpNum == OP_COPY) {
11761    if (LHSID == (1 * 9 + 2) * 9 + 3)
11762      return LHS;
11763    assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
11764    return RHS;
11765  }
11766
11767  if (OpNum == OP_MOVLANE) {
11768    // Decompose a PerfectShuffle ID to get the Mask for lane Elt
11769    auto getPFIDLane = [](unsigned ID, int Elt) -> int {
11770      assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
11771      Elt = 3 - Elt;
11772      while (Elt > 0) {
11773        ID /= 9;
11774        Elt--;
11775      }
11776      return (ID % 9 == 8) ? -1 : ID % 9;
11777    };
11778
11779    // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
11780    // get the lane to move from the PFID, which is always from the
11781    // original vectors (V1 or V2).
11782    SDValue OpLHS = GeneratePerfectShuffle(
11783        LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
11784    EVT VT = OpLHS.getValueType();
11785    assert(RHSID < 8 && "Expected a lane index for RHSID!");
11786    unsigned ExtLane = 0;
11787    SDValue Input;
11788
    // OP_MOVLANE shuffles are either D movs (if bit 0x4 is set) or S movs. D
    // movs convert into a higher type.
11791    if (RHSID & 0x4) {
11792      int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
11793      if (MaskElt == -1)
11794        MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
11795      assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
11796      ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
11797      Input = MaskElt < 2 ? V1 : V2;
11798      if (VT.getScalarSizeInBits() == 16) {
11799        Input = DAG.getBitcast(MVT::v2f32, Input);
11800        OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
11801      } else {
        assert(VT.getScalarSizeInBits() == 32 &&
               "Expected 16 or 32 bit shuffle elements");
11804        Input = DAG.getBitcast(MVT::v2f64, Input);
11805        OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
11806      }
11807    } else {
11808      int MaskElt = getPFIDLane(ID, RHSID);
11809      assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
11810      ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
11811      Input = MaskElt < 4 ? V1 : V2;
11812      // Be careful about creating illegal types. Use f16 instead of i16.
11813      if (VT == MVT::v4i16) {
11814        Input = DAG.getBitcast(MVT::v4f16, Input);
11815        OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
11816      }
11817    }
11818    SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
11819                              Input.getValueType().getVectorElementType(),
11820                              Input, DAG.getVectorIdxConstant(ExtLane, dl));
11821    SDValue Ins =
11822        DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
11823                    Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
11824    return DAG.getBitcast(VT, Ins);
11825  }
11826
11827  SDValue OpLHS, OpRHS;
11828  OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
11829                                 RHS, DAG, dl);
11830  OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
11831                                 RHS, DAG, dl);
11832  EVT VT = OpLHS.getValueType();
11833
11834  switch (OpNum) {
11835  default:
11836    llvm_unreachable("Unknown shuffle opcode!");
11837  case OP_VREV:
11838    // VREV divides the vector in half and swaps within the half.
11839    if (VT.getVectorElementType() == MVT::i32 ||
11840        VT.getVectorElementType() == MVT::f32)
11841      return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
11842    // vrev <4 x i16> -> REV32
11843    if (VT.getVectorElementType() == MVT::i16 ||
11844        VT.getVectorElementType() == MVT::f16 ||
11845        VT.getVectorElementType() == MVT::bf16)
11846      return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
11847    // vrev <4 x i8> -> REV16
11848    assert(VT.getVectorElementType() == MVT::i8);
11849    return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
11850  case OP_VDUP0:
11851  case OP_VDUP1:
11852  case OP_VDUP2:
11853  case OP_VDUP3: {
11854    EVT EltTy = VT.getVectorElementType();
11855    unsigned Opcode;
11856    if (EltTy == MVT::i8)
11857      Opcode = AArch64ISD::DUPLANE8;
11858    else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
11859      Opcode = AArch64ISD::DUPLANE16;
11860    else if (EltTy == MVT::i32 || EltTy == MVT::f32)
11861      Opcode = AArch64ISD::DUPLANE32;
11862    else if (EltTy == MVT::i64 || EltTy == MVT::f64)
11863      Opcode = AArch64ISD::DUPLANE64;
11864    else
11865      llvm_unreachable("Invalid vector element type?");
11866
11867    if (VT.getSizeInBits() == 64)
11868      OpLHS = WidenVector(OpLHS, DAG);
11869    SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
11870    return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
11871  }
11872  case OP_VEXT1:
11873  case OP_VEXT2:
11874  case OP_VEXT3: {
11875    unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
11876    return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
11877                       DAG.getConstant(Imm, dl, MVT::i32));
11878  }
11879  case OP_VUZPL:
11880    return DAG.getNode(AArch64ISD::UZP1, dl, VT, OpLHS, OpRHS);
11881  case OP_VUZPR:
11882    return DAG.getNode(AArch64ISD::UZP2, dl, VT, OpLHS, OpRHS);
11883  case OP_VZIPL:
11884    return DAG.getNode(AArch64ISD::ZIP1, dl, VT, OpLHS, OpRHS);
11885  case OP_VZIPR:
11886    return DAG.getNode(AArch64ISD::ZIP2, dl, VT, OpLHS, OpRHS);
11887  case OP_VTRNL:
11888    return DAG.getNode(AArch64ISD::TRN1, dl, VT, OpLHS, OpRHS);
11889  case OP_VTRNR:
11890    return DAG.getNode(AArch64ISD::TRN2, dl, VT, OpLHS, OpRHS);
11891  }
11892}
11893
11894static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
11895                           SelectionDAG &DAG) {
11896  // Check to see if we can use the TBL instruction.
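  // Each shuffle mask element is expanded into BytesPerElt consecutive byte
  // indices into the byte table formed from the (bitcast) source vectors.
  // Out-of-range byte indices are forced to 255 when the second source is
  // undef or zero, since TBL writes 0 for table indices that are out of range.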
11897  SDValue V1 = Op.getOperand(0);
11898  SDValue V2 = Op.getOperand(1);
11899  SDLoc DL(Op);
11900
11901  EVT EltVT = Op.getValueType().getVectorElementType();
11902  unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
11903
11904  bool Swap = false;
11905  if (V1.isUndef() || isZerosVector(V1.getNode())) {
11906    std::swap(V1, V2);
11907    Swap = true;
11908  }
11909
11910  // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
11911  // out of range values with 0s. We do need to make sure that any out-of-range
11912  // values are really out-of-range for a v16i8 vector.
11913  bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
11914  MVT IndexVT = MVT::v8i8;
11915  unsigned IndexLen = 8;
11916  if (Op.getValueSizeInBits() == 128) {
11917    IndexVT = MVT::v16i8;
11918    IndexLen = 16;
11919  }
11920
11921  SmallVector<SDValue, 8> TBLMask;
11922  for (int Val : ShuffleMask) {
11923    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
11924      unsigned Offset = Byte + Val * BytesPerElt;
11925      if (Swap)
11926        Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
11927      if (IsUndefOrZero && Offset >= IndexLen)
11928        Offset = 255;
11929      TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
11930    }
11931  }
11932
11933  SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
11934  SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
11935
11936  SDValue Shuffle;
11937  if (IsUndefOrZero) {
11938    if (IndexLen == 8)
11939      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
11940    Shuffle = DAG.getNode(
11941        ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
11942        DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
11943        DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
11944  } else {
11945    if (IndexLen == 8) {
11946      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
11947      Shuffle = DAG.getNode(
11948          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
11949          DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
11950          DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
11951    } else {
11952      // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
11953      // cannot currently represent the register constraints on the input
11954      // table registers.
11955      //  Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
11956      //                   DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
11957      //                   IndexLen));
11958      Shuffle = DAG.getNode(
11959          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
11960          DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
11961          V2Cst,
11962          DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
11963    }
11964  }
11965  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
11966}
11967
11968static unsigned getDUPLANEOp(EVT EltType) {
11969  if (EltType == MVT::i8)
11970    return AArch64ISD::DUPLANE8;
11971  if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
11972    return AArch64ISD::DUPLANE16;
11973  if (EltType == MVT::i32 || EltType == MVT::f32)
11974    return AArch64ISD::DUPLANE32;
11975  if (EltType == MVT::i64 || EltType == MVT::f64)
11976    return AArch64ISD::DUPLANE64;
11977
11978  llvm_unreachable("Invalid vector element type?");
11979}
11980
11981static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
11982                            unsigned Opcode, SelectionDAG &DAG) {
11983  // Try to eliminate a bitcasted extract subvector before a DUPLANE.
11984  auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
11985    // Match: dup (bitcast (extract_subv X, C)), LaneC
11986    if (BitCast.getOpcode() != ISD::BITCAST ||
11987        BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
11988      return false;
11989
11990    // The extract index must align in the destination type. That may not
11991    // happen if the bitcast is from narrow to wide type.
11992    SDValue Extract = BitCast.getOperand(0);
11993    unsigned ExtIdx = Extract.getConstantOperandVal(1);
11994    unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
11995    unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
11996    unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
11997    if (ExtIdxInBits % CastedEltBitWidth != 0)
11998      return false;
11999
12000    // Can't handle cases where vector size is not 128-bit
12001    if (!Extract.getOperand(0).getValueType().is128BitVector())
12002      return false;
12003
12004    // Update the lane value by offsetting with the scaled extract index.
12005    LaneC += ExtIdxInBits / CastedEltBitWidth;
12006
12007    // Determine the casted vector type of the wide vector input.
12008    // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
12009    // Examples:
12010    // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
12011    // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
12012    unsigned SrcVecNumElts =
12013        Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
12014    CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
12015                              SrcVecNumElts);
12016    return true;
12017  };
12018  MVT CastVT;
12019  if (getScaledOffsetDup(V, Lane, CastVT)) {
12020    V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
12021  } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12022             V.getOperand(0).getValueType().is128BitVector()) {
12023    // The lane is incremented by the index of the extract.
12024    // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
12025    Lane += V.getConstantOperandVal(1);
12026    V = V.getOperand(0);
12027  } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
12028    // The lane is decremented if we are splatting from the 2nd operand.
12029    // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
12030    unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
12031    Lane -= Idx * VT.getVectorNumElements() / 2;
12032    V = WidenVector(V.getOperand(Idx), DAG);
12033  } else if (VT.getSizeInBits() == 64) {
12034    // Widen the operand to 128-bit register with undef.
12035    V = WidenVector(V, DAG);
12036  }
12037  return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
12038}
12039
// Return true if we can derive a new (half-length) shuffle mask from the
// parameter mask: every two adjacent mask values must refer to an
// even-aligned consecutive pair of source elements (modulo undefs). The
// widened mask is returned in NewMask.
12043static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
12044                           SmallVectorImpl<int> &NewMask) {
12045  unsigned NumElts = VT.getVectorNumElements();
12046  if (NumElts % 2 != 0)
12047    return false;
12048
12049  NewMask.clear();
12050  for (unsigned i = 0; i < NumElts; i += 2) {
12051    int M0 = M[i];
12052    int M1 = M[i + 1];
12053
12054    // If both elements are undef, new mask is undef too.
12055    if (M0 == -1 && M1 == -1) {
12056      NewMask.push_back(-1);
12057      continue;
12058    }
12059
12060    if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
12061      NewMask.push_back(M1 / 2);
12062      continue;
12063    }
12064
12065    if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
12066      NewMask.push_back(M0 / 2);
12067      continue;
12068    }
12069
12070    NewMask.clear();
12071    return false;
12072  }
12073
12074  assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
12075  return true;
12076}
12077
// Try to widen the element type to get a new mask value for a better
// permutation sequence, so that we can use NEON shuffle instructions, such
// as ZIP1/2, UZP1/2, TRN1/2, REV, INS, etc.
12081// For example:
12082//  shufflevector <4 x i32> %a, <4 x i32> %b,
12083//                <4 x i32> <i32 6, i32 7, i32 2, i32 3>
12084// is equivalent to:
12085//  shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
12086// Finally, we can get:
12087//  mov     v0.d[0], v1.d[1]
12088static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
12089  SDLoc DL(Op);
12090  EVT VT = Op.getValueType();
12091  EVT ScalarVT = VT.getVectorElementType();
12092  unsigned ElementSize = ScalarVT.getFixedSizeInBits();
12093  SDValue V0 = Op.getOperand(0);
12094  SDValue V1 = Op.getOperand(1);
12095  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
12096
  // When combining adjacent elements, e.g. two i16's -> i32 or two i32's ->
  // i64, we need to make sure the wider element type is legal. Thus,
  // ElementSize should not be larger than 32 bits, and the i1 type should
  // also be excluded.
12100  if (ElementSize > 32 || ElementSize == 1)
12101    return SDValue();
12102
12103  SmallVector<int, 8> NewMask;
12104  if (isWideTypeMask(Mask, VT, NewMask)) {
12105    MVT NewEltVT = VT.isFloatingPoint()
12106                       ? MVT::getFloatingPointVT(ElementSize * 2)
12107                       : MVT::getIntegerVT(ElementSize * 2);
12108    MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
12109    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
12110      V0 = DAG.getBitcast(NewVT, V0);
12111      V1 = DAG.getBitcast(NewVT, V1);
12112      return DAG.getBitcast(VT,
12113                            DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
12114    }
12115  }
12116
12117  return SDValue();
12118}
12119
12120// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
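// I.e. shuffle (tbl2(a, b, m1), tbl2(c, d, m2), mask) becomes
// tbl4(a, b, c, d, m), where m[i] = m1[mask[i]] when mask[i] < 16 and
// m[i] = m2[mask[i] - 16] + 32 otherwise (c and d occupy byte indices
// 32..63 of the four-register table).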
12121static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
12122                                               ArrayRef<int> ShuffleMask,
12123                                               SelectionDAG &DAG) {
12124  SDValue Tbl1 = Op->getOperand(0);
12125  SDValue Tbl2 = Op->getOperand(1);
12126  SDLoc dl(Op);
12127  SDValue Tbl2ID =
12128      DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
12129
12130  EVT VT = Op.getValueType();
12131  if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12132      Tbl1->getOperand(0) != Tbl2ID ||
12133      Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12134      Tbl2->getOperand(0) != Tbl2ID)
12135    return SDValue();
12136
12137  if (Tbl1->getValueType(0) != MVT::v16i8 ||
12138      Tbl2->getValueType(0) != MVT::v16i8)
12139    return SDValue();
12140
12141  SDValue Mask1 = Tbl1->getOperand(3);
12142  SDValue Mask2 = Tbl2->getOperand(3);
12143  SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
12144  for (unsigned I = 0; I < 16; I++) {
12145    if (ShuffleMask[I] < 16)
12146      TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
12147    else {
12148      auto *C =
12149          dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
12150      if (!C)
12151        return SDValue();
12152      TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
12153    }
12154  }
12155
12156  SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
12157  SDValue ID =
12158      DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
12159
12160  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
12161                     {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
12162                      Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
12163}
12164
// Baseline legalization for ZERO_EXTEND_VECTOR_INREG would blend in zeros, but
// we don't have an appropriate instruction for that, so custom-lower it as a
// ZIP1 with zeros.
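// For example, (v8i16 (zero_extend_vector_inreg (v16i8 X))) becomes
// (v8i16 (bitcast (ZIP1 X, zeros))): interleaving each source element with a
// zero produces the zero-extended wider lanes (illustrated for little-endian
// lane order).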
12168SDValue
12169AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
12170                                                     SelectionDAG &DAG) const {
12171  SDLoc dl(Op);
12172  EVT VT = Op.getValueType();
12173  SDValue SrcOp = Op.getOperand(0);
12174  EVT SrcVT = SrcOp.getValueType();
12175  assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
12176         "Unexpected extension factor.");
12177  unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
12178  // FIXME: support multi-step zipping?
12179  if (Scale != 2)
12180    return SDValue();
12181  SDValue Zeros = DAG.getConstant(0, dl, SrcVT);
12182  return DAG.getBitcast(VT,
12183                        DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros));
12184}
12185
12186SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
12187                                                   SelectionDAG &DAG) const {
12188  SDLoc dl(Op);
12189  EVT VT = Op.getValueType();
12190
12191  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
12192
12193  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
12194    return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
12195
12196  // Convert shuffles that are directly supported on NEON to target-specific
12197  // DAG nodes, instead of keeping them as shuffles and matching them again
12198  // during code selection.  This is more efficient and avoids the possibility
12199  // of inconsistencies between legalization and selection.
12200  ArrayRef<int> ShuffleMask = SVN->getMask();
12201
12202  SDValue V1 = Op.getOperand(0);
12203  SDValue V2 = Op.getOperand(1);
12204
12205  assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
12206  assert(ShuffleMask.size() == VT.getVectorNumElements() &&
12207         "Unexpected VECTOR_SHUFFLE mask size!");
12208
12209  if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
12210    return Res;
12211
12212  if (SVN->isSplat()) {
12213    int Lane = SVN->getSplatIndex();
    // If this is an undef splat, generate it via "just" vdup, if possible.
12215    if (Lane == -1)
12216      Lane = 0;
12217
12218    if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
12219      return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
12220                         V1.getOperand(0));
12221    // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
12222    // constant. If so, we can just reference the lane's definition directly.
12223    if (V1.getOpcode() == ISD::BUILD_VECTOR &&
12224        !isa<ConstantSDNode>(V1.getOperand(Lane)))
12225      return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
12226
12227    // Otherwise, duplicate from the lane of the input vector.
12228    unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
12229    return constructDup(V1, Lane, dl, VT, Opcode, DAG);
12230  }
12231
12232  // Check if the mask matches a DUP for a wider element
12233  for (unsigned LaneSize : {64U, 32U, 16U}) {
12234    unsigned Lane = 0;
12235    if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
12236      unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
12237                                       : LaneSize == 32 ? AArch64ISD::DUPLANE32
12238                                                        : AArch64ISD::DUPLANE16;
12239      // Cast V1 to an integer vector with required lane size
12240      MVT NewEltTy = MVT::getIntegerVT(LaneSize);
12241      unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
12242      MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
12243      V1 = DAG.getBitcast(NewVecTy, V1);
      // Construct the DUP instruction
12245      V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
12246      // Cast back to the original type
12247      return DAG.getBitcast(VT, V1);
12248    }
12249  }
12250
12251  if (isREVMask(ShuffleMask, VT, 64))
12252    return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
12253  if (isREVMask(ShuffleMask, VT, 32))
12254    return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
12255  if (isREVMask(ShuffleMask, VT, 16))
12256    return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
12257
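  // A full reverse of a v8i16 or v16i8 vector is not a single REV, but it can
  // be done with a REV64 (which reverses within each 64-bit half) followed by
  // an EXT of 8 bytes that swaps the two halves.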
12258  if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
12259       (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
12260      ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
12261    SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
12262    return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
12263                       DAG.getConstant(8, dl, MVT::i32));
12264  }
12265
12266  bool ReverseEXT = false;
12267  unsigned Imm;
12268  if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
12269    if (ReverseEXT)
12270      std::swap(V1, V2);
12271    Imm *= getExtFactor(V1);
12272    return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
12273                       DAG.getConstant(Imm, dl, MVT::i32));
12274  } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
12275    Imm *= getExtFactor(V1);
12276    return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
12277                       DAG.getConstant(Imm, dl, MVT::i32));
12278  }
12279
12280  unsigned WhichResult;
12281  if (isZIPMask(ShuffleMask, VT, WhichResult)) {
12282    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12283    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12284  }
12285  if (isUZPMask(ShuffleMask, VT, WhichResult)) {
12286    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12287    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12288  }
12289  if (isTRNMask(ShuffleMask, VT, WhichResult)) {
12290    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12291    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12292  }
12293
12294  if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12295    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12296    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12297  }
12298  if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12299    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12300    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12301  }
12302  if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12303    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12304    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12305  }
12306
12307  if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
12308    return Concat;
12309
12310  bool DstIsLeft;
12311  int Anomaly;
12312  int NumInputElements = V1.getValueType().getVectorNumElements();
12313  if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
12314    SDValue DstVec = DstIsLeft ? V1 : V2;
12315    SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
12316
12317    SDValue SrcVec = V1;
12318    int SrcLane = ShuffleMask[Anomaly];
12319    if (SrcLane >= NumInputElements) {
12320      SrcVec = V2;
12321      SrcLane -= VT.getVectorNumElements();
12322    }
12323    SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
12324
12325    EVT ScalarVT = VT.getVectorElementType();
12326
12327    if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
12328      ScalarVT = MVT::i32;
12329
12330    return DAG.getNode(
12331        ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
12332        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
12333        DstLaneV);
12334  }
12335
12336  if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
12337    return NewSD;
12338
12339  // If the shuffle is not directly supported and it has 4 elements, use
12340  // the PerfectShuffle-generated table to synthesize it from other shuffles.
12341  unsigned NumElts = VT.getVectorNumElements();
12342  if (NumElts == 4) {
12343    unsigned PFIndexes[4];
12344    for (unsigned i = 0; i != 4; ++i) {
12345      if (ShuffleMask[i] < 0)
12346        PFIndexes[i] = 8;
12347      else
12348        PFIndexes[i] = ShuffleMask[i];
12349    }
12350
12351    // Compute the index in the perfect shuffle table.
12352    unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
12353                            PFIndexes[2] * 9 + PFIndexes[3];
12354    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
12355    return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
12356                                  dl);
12357  }
12358
12359  return GenerateTBL(Op, ShuffleMask, DAG);
12360}
12361
12362SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
12363                                                 SelectionDAG &DAG) const {
12364  EVT VT = Op.getValueType();
12365
12366  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
12367    return LowerToScalableOp(Op, DAG);
12368
12369  assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
12370         "Unexpected vector type!");
12371
12372  // We can handle the constant cases during isel.
12373  if (isa<ConstantSDNode>(Op.getOperand(0)))
12374    return Op;
12375
12376  // There isn't a natural way to handle the general i1 case, so we use some
12377  // trickery with whilelo.
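  // After sign-extending the i1 splat value we have either 0 or all-ones in
  // SplatVal. WHILELO(0, 0) then yields an all-false predicate and
  // WHILELO(0, all-ones) an all-true one, because lane i of the result is
  // active iff i <u the second operand.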
12378  SDLoc DL(Op);
12379  SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
12380  SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
12381                         DAG.getValueType(MVT::i1));
12382  SDValue ID =
12383      DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
12384  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
12385  if (VT == MVT::nxv1i1)
12386    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
12387                       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
12388                                   Zero, SplatVal),
12389                       Zero);
12390  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
12391}
12392
12393SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
12394                                             SelectionDAG &DAG) const {
12395  SDLoc DL(Op);
12396
12397  EVT VT = Op.getValueType();
12398  if (!isTypeLegal(VT) || !VT.isScalableVector())
12399    return SDValue();
12400
12401  // Current lowering only supports the SVE-ACLE types.
12402  if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
12403    return SDValue();
12404
  // The DUPQ operation is independent of element type so normalise to i64s.
12406  SDValue Idx128 = Op.getOperand(2);
12407
12408  // DUPQ can be used when idx is in range.
12409  auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
12410  if (CIdx && (CIdx->getZExtValue() <= 3)) {
12411    SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
12412    return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
12413  }
12414
12415  SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
12416
12417  // The ACLE says this must produce the same result as:
12418  //   svtbl(data, svadd_x(svptrue_b64(),
12419  //                       svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
12420  //                       index * 2))
12421  SDValue One = DAG.getConstant(1, DL, MVT::i64);
12422  SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
12423
12424  // create the vector 0,1,0,1,...
12425  SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
12426  SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
12427
12428  // create the vector idx64,idx64+1,idx64,idx64+1,...
12429  SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
12430  SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
12431  SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
12432
12433  // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
12434  SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
12435  return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
12436}
12438
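// Extract the constant bits of a splat BUILD_VECTOR, replicated out to the
// full vector width, into CnstBits, together with UndefBits describing the
// bits contributed by undef lanes. Returns false if BVN is not a constant
// splat.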
12439static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
12440                               APInt &UndefBits) {
12441  EVT VT = BVN->getValueType(0);
12442  APInt SplatBits, SplatUndef;
12443  unsigned SplatBitSize;
12444  bool HasAnyUndefs;
12445  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
12446    unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
12447
12448    for (unsigned i = 0; i < NumSplats; ++i) {
12449      CnstBits <<= SplatBitSize;
12450      UndefBits <<= SplatBitSize;
12451      CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
12452      UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
12453    }
12454
12455    return true;
12456  }
12457
12458  return false;
12459}
12460
12461// Try 64-bit splatted SIMD immediate.
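// This is the MOVI 64-bit "byte mask" form, where every byte of the immediate
// is either 0x00 or 0xff (e.g. 0xff00ffff0000ff00).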
12462static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                  const APInt &Bits) {
12464  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12465    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12466    EVT VT = Op.getValueType();
12467    MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
12468
12469    if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
12470      Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
12471
12472      SDLoc dl(Op);
12473      SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12474                                DAG.getConstant(Value, dl, MVT::i32));
12475      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12476    }
12477  }
12478
12479  return SDValue();
12480}
12481
12482// Try 32-bit splatted SIMD immediate.
12483static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12484                                  const APInt &Bits,
12485                                  const SDValue *LHS = nullptr) {
12486  EVT VT = Op.getValueType();
12487  if (VT.isFixedLengthVector() &&
12488      !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
12489    return SDValue();
12490
12491  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12492    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12493    MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
12494    bool isAdvSIMDModImm = false;
12495    uint64_t Shift;
12496
12497    if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
12498      Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
12499      Shift = 0;
12500    }
12501    else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
12502      Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
12503      Shift = 8;
12504    }
12505    else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
12506      Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
12507      Shift = 16;
12508    }
12509    else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
12510      Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
12511      Shift = 24;
12512    }
12513
12514    if (isAdvSIMDModImm) {
12515      SDLoc dl(Op);
12516      SDValue Mov;
12517
12518      if (LHS)
12519        Mov = DAG.getNode(NewOp, dl, MovTy,
12520                          DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
12521                          DAG.getConstant(Value, dl, MVT::i32),
12522                          DAG.getConstant(Shift, dl, MVT::i32));
12523      else
12524        Mov = DAG.getNode(NewOp, dl, MovTy,
12525                          DAG.getConstant(Value, dl, MVT::i32),
12526                          DAG.getConstant(Shift, dl, MVT::i32));
12527
12528      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12529    }
12530  }
12531
12532  return SDValue();
12533}
12534
12535// Try 16-bit splatted SIMD immediate.
12536static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12537                                  const APInt &Bits,
12538                                  const SDValue *LHS = nullptr) {
12539  EVT VT = Op.getValueType();
12540  if (VT.isFixedLengthVector() &&
12541      !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
12542    return SDValue();
12543
12544  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12545    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12546    MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
12547    bool isAdvSIMDModImm = false;
12548    uint64_t Shift;
12549
12550    if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
12551      Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
12552      Shift = 0;
12553    }
12554    else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
12555      Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
12556      Shift = 8;
12557    }
12558
12559    if (isAdvSIMDModImm) {
12560      SDLoc dl(Op);
12561      SDValue Mov;
12562
12563      if (LHS)
12564        Mov = DAG.getNode(NewOp, dl, MovTy,
12565                          DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
12566                          DAG.getConstant(Value, dl, MVT::i32),
12567                          DAG.getConstant(Shift, dl, MVT::i32));
12568      else
12569        Mov = DAG.getNode(NewOp, dl, MovTy,
12570                          DAG.getConstant(Value, dl, MVT::i32),
12571                          DAG.getConstant(Shift, dl, MVT::i32));
12572
12573      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12574    }
12575  }
12576
12577  return SDValue();
12578}
12579
12580// Try 32-bit splatted SIMD immediate with shifted ones.
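// These are the MOVI/MVNI "MSL" forms that shift ones in from the right; the
// shift values 264 and 272 used below are assumed to encode MSL #8 and
// MSL #16 respectively (i.e. 256 plus the shift amount).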
12581static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
12582                                    SelectionDAG &DAG, const APInt &Bits) {
12583  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12584    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12585    EVT VT = Op.getValueType();
12586    MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
12587    bool isAdvSIMDModImm = false;
12588    uint64_t Shift;
12589
12590    if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
12591      Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
12592      Shift = 264;
12593    }
12594    else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
12595      Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
12596      Shift = 272;
12597    }
12598
12599    if (isAdvSIMDModImm) {
12600      SDLoc dl(Op);
12601      SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12602                                DAG.getConstant(Value, dl, MVT::i32),
12603                                DAG.getConstant(Shift, dl, MVT::i32));
12604      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12605    }
12606  }
12607
12608  return SDValue();
12609}
12610
12611// Try 8-bit splatted SIMD immediate.
12612static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12613                                 const APInt &Bits) {
12614  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12615    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12616    EVT VT = Op.getValueType();
12617    MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
12618
12619    if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
12620      Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
12621
12622      SDLoc dl(Op);
12623      SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12624                                DAG.getConstant(Value, dl, MVT::i32));
12625      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12626    }
12627  }
12628
12629  return SDValue();
12630}
12631
12632// Try FP splatted SIMD immediate.
12633static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12634                                  const APInt &Bits) {
12635  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12636    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12637    EVT VT = Op.getValueType();
12638    bool isWide = (VT.getSizeInBits() == 128);
12639    MVT MovTy;
12640    bool isAdvSIMDModImm = false;
12641
12642    if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
12643      Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
12644      MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
12645    }
12646    else if (isWide &&
12647             (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
12648      Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
12649      MovTy = MVT::v2f64;
12650    }
12651
12652    if (isAdvSIMDModImm) {
12653      SDLoc dl(Op);
12654      SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12655                                DAG.getConstant(Value, dl, MVT::i32));
12656      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12657    }
12658  }
12659
12660  return SDValue();
12661}
12662
// Specialized code to quickly find if PotentialBVec is a BuildVector that
// consists of only the same constant int value, returned in the reference
// argument ConstVal.
12666static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
12667                                     uint64_t &ConstVal) {
12668  BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
12669  if (!Bvec)
12670    return false;
12671  ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
12672  if (!FirstElt)
12673    return false;
12674  EVT VT = Bvec->getValueType(0);
12675  unsigned NumElts = VT.getVectorNumElements();
12676  for (unsigned i = 1; i < NumElts; ++i)
12677    if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
12678      return false;
12679  ConstVal = FirstElt->getZExtValue();
12680  return true;
12681}
12682
12683static bool isAllInactivePredicate(SDValue N) {
12684  // Look through cast.
12685  while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
12686    N = N.getOperand(0);
12687
12688  return ISD::isConstantSplatVectorAllZeros(N.getNode());
12689}
12690
12691static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
12692  unsigned NumElts = N.getValueType().getVectorMinNumElements();
12693
12694  // Look through cast.
12695  while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
12696    N = N.getOperand(0);
12697    // When reinterpreting from a type with fewer elements the "new" elements
12698    // are not active, so bail if they're likely to be used.
12699    if (N.getValueType().getVectorMinNumElements() < NumElts)
12700      return false;
12701  }
12702
12703  if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
12704    return true;
12705
12706  // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
12707  // or smaller than the implicit element type represented by N.
12708  // NOTE: A larger element count implies a smaller element type.
12709  if (N.getOpcode() == AArch64ISD::PTRUE &&
12710      N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
12711    return N.getValueType().getVectorMinNumElements() >= NumElts;
12712
12713  // If we're compiling for a specific vector-length, we can check if the
12714  // pattern's VL equals that of the scalable vector at runtime.
12715  if (N.getOpcode() == AArch64ISD::PTRUE) {
12716    const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
12717    unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
12718    unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
12719    if (MaxSVESize && MinSVESize == MaxSVESize) {
12720      unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
12721      unsigned PatNumElts =
12722          getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
12723      return PatNumElts == (NumElts * VScale);
12724    }
12725  }
12726
12727  return false;
12728}
12729
// Attempt to transform (or (and X, BvecC1), (lsl Y, C2)) into a vector
// S[LR]I, i.e. (SLI X, Y, C2), where X and Y have matching vector types,
// BvecC1 is a BUILD_VECTOR with constant element C1, C2 is a constant, and:
12733//   - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
12734//   - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
12735// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
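// For example, for v4i32 with C2 == 8,
//   (or (and X, <0xff,0xff,0xff,0xff>), (AArch64ISD::VSHL Y, 8))
// becomes (AArch64ISD::VSLI X, Y, 8), which keeps the low 8 bits of each lane
// of X and inserts the shifted lanes of Y above them.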
12736static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
12737  EVT VT = N->getValueType(0);
12738
12739  if (!VT.isVector())
12740    return SDValue();
12741
12742  SDLoc DL(N);
12743
12744  SDValue And;
12745  SDValue Shift;
12746
12747  SDValue FirstOp = N->getOperand(0);
12748  unsigned FirstOpc = FirstOp.getOpcode();
12749  SDValue SecondOp = N->getOperand(1);
12750  unsigned SecondOpc = SecondOp.getOpcode();
12751
12752  // Is one of the operands an AND or a BICi? The AND may have been optimised to
12753  // a BICi in order to use an immediate instead of a register.
  // Is the other operand a shl or lshr? This will have been turned into:
12755  // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
12756  // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
12757  if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
12758      (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
12759       SecondOpc == AArch64ISD::SHL_PRED ||
12760       SecondOpc == AArch64ISD::SRL_PRED)) {
12761    And = FirstOp;
12762    Shift = SecondOp;
12763
12764  } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
12765             (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
12766              FirstOpc == AArch64ISD::SHL_PRED ||
12767              FirstOpc == AArch64ISD::SRL_PRED)) {
12768    And = SecondOp;
12769    Shift = FirstOp;
12770  } else
12771    return SDValue();
12772
12773  bool IsAnd = And.getOpcode() == ISD::AND;
12774  bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
12775                      Shift.getOpcode() == AArch64ISD::SRL_PRED;
12776  bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
12777                        Shift.getOpcode() == AArch64ISD::SRL_PRED;
12778
12779  // Is the shift amount constant and are all lanes active?
12780  uint64_t C2;
12781  if (ShiftHasPredOp) {
12782    if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
12783      return SDValue();
12784    APInt C;
12785    if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
12786      return SDValue();
12787    C2 = C.getZExtValue();
12788  } else if (ConstantSDNode *C2node =
12789                 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
12790    C2 = C2node->getZExtValue();
12791  else
12792    return SDValue();
12793
12794  APInt C1AsAPInt;
12795  unsigned ElemSizeInBits = VT.getScalarSizeInBits();
12796  if (IsAnd) {
12797    // Is the and mask vector all constant?
12798    if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
12799      return SDValue();
12800  } else {
12801    // Reconstruct the corresponding AND immediate from the two BICi immediates.
12802    ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
12803    ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
12804    assert(C1nodeImm && C1nodeShift);
12805    C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
12806    C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
12807  }
12808
12809  // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
12810  // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
12811  // how much one can shift elements of a particular size?
12812  if (C2 > ElemSizeInBits)
12813    return SDValue();
12814
12815  APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
12816                                  : APInt::getLowBitsSet(ElemSizeInBits, C2);
12817  if (C1AsAPInt != RequiredC1)
12818    return SDValue();
12819
12820  SDValue X = And.getOperand(0);
12821  SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
12822  SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
12823                               : Shift.getOperand(1);
12824
12825  unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
12826  SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
12827
12828  LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
12829  LLVM_DEBUG(N->dump(&DAG));
12830  LLVM_DEBUG(dbgs() << "into: \n");
12831  LLVM_DEBUG(ResultSLI->dump(&DAG));
12832
12833  ++NumShiftInserts;
12834  return ResultSLI;
12835}
12836
12837SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
12838                                             SelectionDAG &DAG) const {
12839  if (useSVEForFixedLengthVectorVT(Op.getValueType(),
12840                                   !Subtarget->isNeonAvailable()))
12841    return LowerToScalableOp(Op, DAG);
12842
12843  // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
12844  if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
12845    return Res;
12846
12847  EVT VT = Op.getValueType();
12848  if (VT.isScalableVector())
12849    return Op;
12850
12851  SDValue LHS = Op.getOperand(0);
12852  BuildVectorSDNode *BVN =
12853      dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
12854  if (!BVN) {
12855    // OR commutes, so try swapping the operands.
12856    LHS = Op.getOperand(1);
12857    BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
12858  }
12859  if (!BVN)
12860    return Op;
12861
12862  APInt DefBits(VT.getSizeInBits(), 0);
12863  APInt UndefBits(VT.getSizeInBits(), 0);
12864  if (resolveBuildVector(BVN, DefBits, UndefBits)) {
12865    SDValue NewOp;
12866
12867    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
12868                                    DefBits, &LHS)) ||
12869        (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
12870                                    DefBits, &LHS)))
12871      return NewOp;
12872
12873    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
12874                                    UndefBits, &LHS)) ||
12875        (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
12876                                    UndefBits, &LHS)))
12877      return NewOp;
12878  }
12879
12880  // We can always fall back to a non-immediate OR.
12881  return Op;
12882}
12883
12884// Normalize the operands of BUILD_VECTOR. The value of constant operands will
12885// be truncated to fit element width.
12886static SDValue NormalizeBuildVector(SDValue Op,
12887                                    SelectionDAG &DAG) {
12888  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
12889  SDLoc dl(Op);
12890  EVT VT = Op.getValueType();
12891  EVT EltTy = VT.getVectorElementType();
12892
12893  if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
12894    return Op;
12895
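  // Rebuild the BUILD_VECTOR with each constant operand truncated to the
  // element width but still carried as an i32, e.g. for a v8i8 vector the
  // i32 constant 0x1ff becomes 0xff.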
12896  SmallVector<SDValue, 16> Ops;
12897  for (SDValue Lane : Op->ops()) {
12898    // For integer vectors, type legalization would have promoted the
12899    // operands already. Otherwise, if Op is a floating-point splat
12900    // (with operands cast to integers), then the only possibilities
12901    // are constants and UNDEFs.
12902    if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
12903      APInt LowBits(EltTy.getSizeInBits(),
12904                    CstLane->getZExtValue());
12905      Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
12906    } else if (Lane.getNode()->isUndef()) {
12907      Lane = DAG.getUNDEF(MVT::i32);
12908    } else {
12909      assert(Lane.getValueType() == MVT::i32 &&
12910             "Unexpected BUILD_VECTOR operand type");
12911    }
12912    Ops.push_back(Lane);
12913  }
12914  return DAG.getBuildVector(VT, dl, Ops);
12915}
12916
12917static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
12918  EVT VT = Op.getValueType();
12919
12920  APInt DefBits(VT.getSizeInBits(), 0);
12921  APInt UndefBits(VT.getSizeInBits(), 0);
12922  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
12923  if (resolveBuildVector(BVN, DefBits, UndefBits)) {
12924    SDValue NewOp;
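    // Try the AdvSIMD modified-immediate encodings, widest element size first:
    // MOVI/FMOV forms for the constant itself, then MVNI forms for its bitwise
    // inverse, then the same two attempts again using the alternative
    // interpretation of any undefined lanes.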
12925    if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
12926        (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
12927        (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
12928        (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
12929        (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
12930        (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
12931      return NewOp;
12932
12933    DefBits = ~DefBits;
12934    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
12935        (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
12936        (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
12937      return NewOp;
12938
12939    DefBits = UndefBits;
12940    if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
12941        (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
12942        (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
12943        (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
12944        (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
12945        (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
12946      return NewOp;
12947
12948    DefBits = ~UndefBits;
12949    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
12950        (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
12951        (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
12952      return NewOp;
12953  }
12954
12955  return SDValue();
12956}
12957
12958SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
12959                                                 SelectionDAG &DAG) const {
12960  EVT VT = Op.getValueType();
12961
12962  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
12963    if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
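      // For example, the constant sequence <4 x i32> <1, 3, 5, 7> is built as
      // splat(1) + step_vector(2) on the scalable container type.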
12964      SDLoc DL(Op);
12965      EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
12966      SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
12967      SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
12968      SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
12969      return convertFromScalableVector(DAG, Op.getValueType(), Seq);
12970    }
12971
12972    // Revert to common legalisation for all other variants.
12973    return SDValue();
12974  }
12975
12976  // Try to build a simple constant vector.
12977  Op = NormalizeBuildVector(Op, DAG);
12978  // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
12979  // abort.
12980  if (Op.getOpcode() != ISD::BUILD_VECTOR)
12981    return SDValue();
12982
12983  // Certain vector constants, used to express things like logical NOT and
12984  // arithmetic NEG, are passed through unmodified.  This allows special
12985  // patterns for these operations to match, which will lower these constants
12986  // to whatever is proven necessary.
12987  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
12988  if (BVN->isConstant()) {
12989    if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
12990      unsigned BitSize = VT.getVectorElementType().getSizeInBits();
12991      APInt Val(BitSize,
12992                Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
12993      if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
12994        return Op;
12995    }
12996    if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
12997      if (Const->isZero() && !Const->isNegative())
12998        return Op;
12999  }
13000
13001  if (SDValue V = ConstantBuildVector(Op, DAG))
13002    return V;
13003
13004  // Scan through the operands to find some interesting properties we can
13005  // exploit:
13006  //   1) If only one value is used, we can use a DUP, or
13007  //   2) if only the low element is not undef, we can just insert that, or
13008  //   3) if only one constant value is used (w/ some non-constant lanes),
13009  //      we can splat the constant value into the whole vector then fill
13010  //      in the non-constant lanes.
13011  //   4) FIXME: If different constant values are used, but we can intelligently
13012  //             select the values we'll be overwriting for the non-constant
13013  //             lanes such that we can directly materialize the vector
13014  //             some other way (MOVI, e.g.), we can be sneaky.
13015  //   5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
13016  SDLoc dl(Op);
13017  unsigned NumElts = VT.getVectorNumElements();
13018  bool isOnlyLowElement = true;
13019  bool usesOnlyOneValue = true;
13020  bool usesOnlyOneConstantValue = true;
13021  bool isConstant = true;
13022  bool AllLanesExtractElt = true;
13023  unsigned NumConstantLanes = 0;
13024  unsigned NumDifferentLanes = 0;
13025  unsigned NumUndefLanes = 0;
13026  SDValue Value;
13027  SDValue ConstantValue;
13028  SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
13029  unsigned ConsecutiveValCount = 0;
13030  SDValue PrevVal;
13031  for (unsigned i = 0; i < NumElts; ++i) {
13032    SDValue V = Op.getOperand(i);
13033    if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13034      AllLanesExtractElt = false;
13035    if (V.isUndef()) {
13036      ++NumUndefLanes;
13037      continue;
13038    }
13039    if (i > 0)
13040      isOnlyLowElement = false;
13041    if (!isIntOrFPConstant(V))
13042      isConstant = false;
13043
13044    if (isIntOrFPConstant(V)) {
13045      ++NumConstantLanes;
13046      if (!ConstantValue.getNode())
13047        ConstantValue = V;
13048      else if (ConstantValue != V)
13049        usesOnlyOneConstantValue = false;
13050    }
13051
13052    if (!Value.getNode())
13053      Value = V;
13054    else if (V != Value) {
13055      usesOnlyOneValue = false;
13056      ++NumDifferentLanes;
13057    }
13058
13059    if (PrevVal != V) {
13060      ConsecutiveValCount = 0;
13061      PrevVal = V;
13062    }
13063
13064    // Keep track of each different value and its latest consecutive count. For example,
13065    //
13066    //  t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13067    //                            t24, t24, t24, t24, t24, t24, t24, t24
13068    //  t23 = consecutive count 8
13069    //  t24 = consecutive count 8
13070    // ------------------------------------------------------------------
13071    //  t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
13072    //                            t24, t24, t24, t24, t24, t24, t24, t24
13073    //  t23 = consecutive count 5
13074    //  t24 = consecutive count 9
13075    DifferentValueMap[V] = ++ConsecutiveValCount;
13076  }
13077
13078  if (!Value.getNode()) {
13079    LLVM_DEBUG(
13080        dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
13081    return DAG.getUNDEF(VT);
13082  }
13083
13084  // Convert BUILD_VECTOR where all elements but the lowest are undef into
13085  // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
13086  // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
13087  if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
13088    LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
13089                         "SCALAR_TO_VECTOR node\n");
13090    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
13091  }
13092
13093  if (AllLanesExtractElt) {
13094    SDNode *Vector = nullptr;
13095    bool Even = false;
13096    bool Odd = false;
13097    // Check whether the extract elements match the Even pattern <0,2,4,...> or
13098    // the Odd pattern <1,3,5,...>.
13099    for (unsigned i = 0; i < NumElts; ++i) {
13100      SDValue V = Op.getOperand(i);
13101      const SDNode *N = V.getNode();
13102      if (!isa<ConstantSDNode>(N->getOperand(1))) {
13103        Even = false;
13104        Odd = false;
13105        break;
13106      }
13107      SDValue N0 = N->getOperand(0);
13108
13109      // All elements are extracted from the same vector.
13110      if (!Vector) {
13111        Vector = N0.getNode();
13112        // Check that the type of EXTRACT_VECTOR_ELT matches the type of
13113        // BUILD_VECTOR.
13114        if (VT.getVectorElementType() !=
13115            N0.getValueType().getVectorElementType())
13116          break;
13117      } else if (Vector != N0.getNode()) {
13118        Odd = false;
13119        Even = false;
13120        break;
13121      }
13122
13123      // Extracted values are either at Even indices <0,2,4,...> or at Odd
13124      // indices <1,3,5,...>.
13125      uint64_t Val = N->getConstantOperandVal(1);
13126      if (Val == 2 * i) {
13127        Even = true;
13128        continue;
13129      }
13130      if (Val - 1 == 2 * i) {
13131        Odd = true;
13132        continue;
13133      }
13134
13135      // Something does not match: abort.
13136      Odd = false;
13137      Even = false;
13138      break;
13139    }
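    // For example, building <a[0], a[2], a[4], a[6]> from a v8iN vector a
    // becomes UZP1(lo(a), hi(a)), and <a[1], a[3], a[5], a[7]> becomes UZP2.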
13140    if (Even || Odd) {
13141      SDValue LHS =
13142          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13143                      DAG.getConstant(0, dl, MVT::i64));
13144      SDValue RHS =
13145          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13146                      DAG.getConstant(NumElts, dl, MVT::i64));
13147
13148      if (Even && !Odd)
13149        return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
13150                           RHS);
13151      if (Odd && !Even)
13152        return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
13153                           RHS);
13154    }
13155  }
13156
13157  // Use DUP for non-constant splats. For FP constant splats, bitcast the
13158  // lanes to the equivalently sized integer type and try again.
13159  if (usesOnlyOneValue) {
13160    if (!isConstant) {
13161      if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13162          Value.getValueType() != VT) {
13163        LLVM_DEBUG(
13164            dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
13165        return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
13166      }
13167
13168      // This is actually a DUPLANExx operation, which keeps everything in vectors.
13169
13170      SDValue Lane = Value.getOperand(1);
13171      Value = Value.getOperand(0);
13172      if (Value.getValueSizeInBits() == 64) {
13173        LLVM_DEBUG(
13174            dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
13175                      "widening it\n");
13176        Value = WidenVector(Value, DAG);
13177      }
13178
13179      unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
13180      return DAG.getNode(Opcode, dl, VT, Value, Lane);
13181    }
13182
13183    if (VT.getVectorElementType().isFloatingPoint()) {
13184      SmallVector<SDValue, 8> Ops;
13185      EVT EltTy = VT.getVectorElementType();
13186      assert((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
13187              EltTy == MVT::f64) && "Unsupported floating-point vector type");
13188      LLVM_DEBUG(
13189          dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
13190                    "BITCASTS, and try again\n");
13191      MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
13192      for (unsigned i = 0; i < NumElts; ++i)
13193        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
13194      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
13195      SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
13196      LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
13197                 Val.dump(););
13198      Val = LowerBUILD_VECTOR(Val, DAG);
13199      if (Val.getNode())
13200        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
13201    }
13202  }
13203
13204  // If we need to insert a small number of different non-constant elements and
13205  // the vector width is sufficiently large, prefer using DUP with the common
13206  // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
13207  // skip the constant lane handling below.
13208  bool PreferDUPAndInsert =
13209      !isConstant && NumDifferentLanes >= 1 &&
13210      NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
13211      NumDifferentLanes >= NumConstantLanes;
13212
13213  // If only one constant value was used, and it appeared in more than one
13214  // lane, start by splatting that value, then replace the non-constant lanes.
13215  // This is better than the default, which will perform a separate
13216  // initialization for each lane.
13217  if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
13218    // Firstly, try to materialize the splat constant.
13219    SDValue Val = DAG.getSplatBuildVector(VT, dl, ConstantValue);
13220    unsigned BitSize = VT.getScalarSizeInBits();
13221    APInt ConstantValueAPInt(1, 0);
13222    if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
13223      ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
13224    if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
13225        !ConstantValueAPInt.isAllOnes()) {
13226      Val = ConstantBuildVector(Val, DAG);
13227      if (!Val)
13228        // Otherwise, materialize the constant and splat it.
13229        Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
13230    }
13231
13232    // Now insert the non-constant lanes.
13233    for (unsigned i = 0; i < NumElts; ++i) {
13234      SDValue V = Op.getOperand(i);
13235      SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13236      if (!isIntOrFPConstant(V))
13237        // Note that type legalization likely mucked about with the VT of the
13238        // source operand, so we may have to convert it here before inserting.
13239        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
13240    }
13241    return Val;
13242  }
13243
13244  // This will generate a load from the constant pool.
13245  if (isConstant) {
13246    LLVM_DEBUG(
13247        dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
13248                  "expansion\n");
13249    return SDValue();
13250  }
13251
13252  // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13253  // v4i32s. This is really a truncate, which we can construct out of (legal)
13254  // concats and truncate nodes.
13255  if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
13256    return M;
13257
13258  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
13259  if (NumElts >= 4) {
13260    if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
13261      return Shuffle;
13262
13263    if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
13264      return Shuffle;
13265  }
13266
13267  if (PreferDUPAndInsert) {
13268    // First, build a constant vector with the common element.
13269    SmallVector<SDValue, 8> Ops(NumElts, Value);
13270    SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
13271    // Next, insert the elements that do not match the common value.
13272    for (unsigned I = 0; I < NumElts; ++I)
13273      if (Op.getOperand(I) != Value)
13274        NewVector =
13275            DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
13276                        Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
13277
13278    return NewVector;
13279  }
13280
13281  // If vector consists of two different values, try to generate two DUPs and
13282  // (CONCAT_VECTORS or VECTOR_SHUFFLE).
13283  if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
13284    SmallVector<SDValue, 2> Vals;
13285    // Check whether each value's consecutive count is half the number of
13286    // vector elements. In that case, we can use CONCAT_VECTORS. For example,
13287    //
13288    // canUseVECTOR_CONCAT = true;
13289    //  t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13290    //                            t24, t24, t24, t24, t24, t24, t24, t24
13291    //
13292    // canUseVECTOR_CONCAT = false;
13293    //  t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
13294    //                            t24, t24, t24, t24, t24, t24, t24, t24
13295    bool canUseVECTOR_CONCAT = true;
13296    for (auto Pair : DifferentValueMap) {
13297      // Check that each different value occupies exactly NumElts / 2 lanes.
13298      if (Pair.second != NumElts / 2)
13299        canUseVECTOR_CONCAT = false;
13300      Vals.push_back(Pair.first);
13301    }
13302
13303    // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
13304    // CONCAT_VECTORs. For example,
13305    //
13306    //  t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
13307    //                            t24, t24, t24, t24, t24, t24, t24, t24
13308    // ==>
13309    //    t26: v8i8 = AArch64ISD::DUP t23
13310    //    t28: v8i8 = AArch64ISD::DUP t24
13311    //  t29: v16i8 = concat_vectors t26, t28
13312    if (canUseVECTOR_CONCAT) {
13313      EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13314      if (isTypeLegal(SubVT) && SubVT.isVector() &&
13315          SubVT.getVectorNumElements() >= 2) {
13316        SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
13317        SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
13318        SDValue DUP1 =
13319            LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG);
13320        SDValue DUP2 =
13321            LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG);
13322        SDValue CONCAT_VECTORS =
13323            DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
13324        return CONCAT_VECTORS;
13325      }
13326    }
13327
13328    // Let's try to generate VECTOR_SHUFFLE. For example,
13329    //
13330    //  t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
13331    //  ==>
13332    //    t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
13333    //    t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
13334    //  t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
13335    if (NumElts >= 8) {
13336      SmallVector<int, 16> MaskVec;
13337      // Build the mask for the VECTOR_SHUFFLE.
13338      SDValue FirstLaneVal = Op.getOperand(0);
13339      for (unsigned i = 0; i < NumElts; ++i) {
13340        SDValue Val = Op.getOperand(i);
13341        if (FirstLaneVal == Val)
13342          MaskVec.push_back(i);
13343        else
13344          MaskVec.push_back(i + NumElts);
13345      }
13346
13347      SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
13348      SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
13349      SDValue VEC1 = DAG.getBuildVector(VT, dl, Ops1);
13350      SDValue VEC2 = DAG.getBuildVector(VT, dl, Ops2);
13351      SDValue VECTOR_SHUFFLE =
13352          DAG.getVectorShuffle(VT, dl, VEC1, VEC2, MaskVec);
13353      return VECTOR_SHUFFLE;
13354    }
13355  }
13356
13357  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
13358  // know the default expansion would otherwise fall back on something even
13359  // worse. For a vector with one or two non-undef values, that's
13360  // scalar_to_vector for the elements followed by a shuffle (provided the
13361  // shuffle is valid for the target) and materialization element by element
13362  // on the stack followed by a load for everything else.
13363  if (!isConstant && !usesOnlyOneValue) {
13364    LLVM_DEBUG(
13365        dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
13366                  "of INSERT_VECTOR_ELT\n");
13367
13368    SDValue Vec = DAG.getUNDEF(VT);
13369    SDValue Op0 = Op.getOperand(0);
13370    unsigned i = 0;
13371
13372    // Use SCALAR_TO_VECTOR for lane zero to
13373    // a) Avoid a RMW dependency on the full vector register, and
13374    // b) Allow the register coalescer to fold away the copy if the
13375    //    value is already in an S or D register, and we're forced to emit an
13376    //    INSERT_SUBREG that we can't fold anywhere.
13377    //
13378    // We also allow types like i8 and i16 which are illegal scalar but legal
13379    // vector element types. After type-legalization the inserted value is
13380    // extended (i32) and it is safe to cast them to the vector type by ignoring
13381    // the upper bits of the lowest lane (e.g. v8i8, v4i16).
13382    if (!Op0.isUndef()) {
13383      LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
13384      Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
13385      ++i;
13386    }
13387    LLVM_DEBUG(if (i < NumElts) dbgs()
13388                   << "Creating nodes for the other vector elements:\n";);
13389    for (; i < NumElts; ++i) {
13390      SDValue V = Op.getOperand(i);
13391      if (V.isUndef())
13392        continue;
13393      SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13394      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
13395    }
13396    return Vec;
13397  }
13398
13399  LLVM_DEBUG(
13400      dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
13401                "better alternative\n");
13402  return SDValue();
13403}
13404
13405SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
13406                                                   SelectionDAG &DAG) const {
13407  if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13408                                   !Subtarget->isNeonAvailable()))
13409    return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
13410
13411  assert(Op.getValueType().isScalableVector() &&
13412         isTypeLegal(Op.getValueType()) &&
13413         "Expected legal scalable vector type!");
13414
13415  if (isTypeLegal(Op.getOperand(0).getValueType())) {
13416    unsigned NumOperands = Op->getNumOperands();
13417    assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
13418           "Unexpected number of operands in CONCAT_VECTORS");
13419
13420    if (NumOperands == 2)
13421      return Op;
13422
13423    // Concat each pair of subvectors and pack into the lower half of the array.
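    // For example, with four operands: [A, B, C, D] -> [AB, CD] -> [ABCD].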
13424    SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
13425    while (ConcatOps.size() > 1) {
13426      for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
13427        SDValue V1 = ConcatOps[I];
13428        SDValue V2 = ConcatOps[I + 1];
13429        EVT SubVT = V1.getValueType();
13430        EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
13431        ConcatOps[I / 2] =
13432            DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
13433      }
13434      ConcatOps.resize(ConcatOps.size() / 2);
13435    }
13436    return ConcatOps[0];
13437  }
13438
13439  return SDValue();
13440}
13441
13442SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13443                                                      SelectionDAG &DAG) const {
13444  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
13445
13446  if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13447                                   !Subtarget->isNeonAvailable()))
13448    return LowerFixedLengthInsertVectorElt(Op, DAG);
13449
13450  EVT VT = Op.getOperand(0).getValueType();
13451
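  // There is no direct way to insert into a vector of i1 elements, so promote
  // to an integer vector, perform the insert there, and truncate the result
  // back to the original type.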
13452  if (VT.getScalarType() == MVT::i1) {
13453    EVT VectorVT = getPromotedVTForPredicate(VT);
13454    SDLoc DL(Op);
13455    SDValue ExtendedVector =
13456        DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
13457    SDValue ExtendedValue =
13458        DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
13459                             VectorVT.getScalarType().getSizeInBits() < 32
13460                                 ? MVT::i32
13461                                 : VectorVT.getScalarType());
13462    ExtendedVector =
13463        DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
13464                    ExtendedValue, Op.getOperand(2));
13465    return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
13466  }
13467
13468  // Check for non-constant or out of range lane.
13469  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
13470  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13471    return SDValue();
13472
13473  return Op;
13474}
13475
13476SDValue
13477AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13478                                               SelectionDAG &DAG) const {
13479  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
13480  EVT VT = Op.getOperand(0).getValueType();
13481
13482  if (VT.getScalarType() == MVT::i1) {
13483    // We can't directly extract from an SVE predicate; extend it first.
13484    // (This isn't the only possible lowering, but it's straightforward.)
13485    EVT VectorVT = getPromotedVTForPredicate(VT);
13486    SDLoc DL(Op);
13487    SDValue Extend =
13488        DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
13489    MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
13490    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
13491                                  Extend, Op.getOperand(1));
13492    return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
13493  }
13494
13495  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
13496    return LowerFixedLengthExtractVectorElt(Op, DAG);
13497
13498  // Check for non-constant or out of range lane.
13499  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
13500  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13501    return SDValue();
13502
13503  // Insertion/extraction are legal for V128 types.
13504  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
13505      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
13506      VT == MVT::v8f16 || VT == MVT::v8bf16)
13507    return Op;
13508
13509  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
13510      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
13511      VT != MVT::v4bf16)
13512    return SDValue();
13513
13514  // For V64 types, we perform extraction by expanding the value
13515  // to a V128 type and perform the extraction on that.
13516  SDLoc DL(Op);
13517  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
13518  EVT WideTy = WideVec.getValueType();
13519
13520  EVT ExtrTy = WideTy.getVectorElementType();
13521  if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
13522    ExtrTy = MVT::i32;
13523
13524  // For extractions, we just return the result directly.
13525  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
13526                     Op.getOperand(1));
13527}
13528
13529SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
13530                                                      SelectionDAG &DAG) const {
13531  assert(Op.getValueType().isFixedLengthVector() &&
13532         "Only cases that extract a fixed length vector are supported!");
13533
13534  EVT InVT = Op.getOperand(0).getValueType();
13535  unsigned Idx = Op.getConstantOperandVal(1);
13536  unsigned Size = Op.getValueSizeInBits();
13537
13538  // If we don't have legal types yet, do nothing
13539  if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
13540    return SDValue();
13541
13542  if (InVT.isScalableVector()) {
13543    // This will be matched by custom code during ISelDAGToDAG.
13544    if (Idx == 0 && isPackedVectorType(InVT, DAG))
13545      return Op;
13546
13547    return SDValue();
13548  }
13549
13550  // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
13551  if (Idx == 0 && InVT.getSizeInBits() <= 128)
13552    return Op;
13553
13554  // If this is extracting the upper 64-bits of a 128-bit vector, we match
13555  // that directly.
13556  if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
13557      InVT.getSizeInBits() == 128 && Subtarget->isNeonAvailable())
13558    return Op;
13559
13560  if (useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
13561    SDLoc DL(Op);
13562
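    // Extract the subvector by splicing the scalable container with itself;
    // this rotates the requested elements down to the start of the vector.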
13563    EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
13564    SDValue NewInVec =
13565        convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
13566
13567    SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec,
13568                                 NewInVec, DAG.getConstant(Idx, DL, MVT::i64));
13569    return convertFromScalableVector(DAG, Op.getValueType(), Splice);
13570  }
13571
13572  return SDValue();
13573}
13574
13575SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
13576                                                     SelectionDAG &DAG) const {
13577  assert(Op.getValueType().isScalableVector() &&
13578         "Only expect to lower inserts into scalable vectors!");
13579
13580  EVT InVT = Op.getOperand(1).getValueType();
13581  unsigned Idx = Op.getConstantOperandVal(2);
13582
13583  SDValue Vec0 = Op.getOperand(0);
13584  SDValue Vec1 = Op.getOperand(1);
13585  SDLoc DL(Op);
13586  EVT VT = Op.getValueType();
13587
13588  if (InVT.isScalableVector()) {
13589    if (!isTypeLegal(VT))
13590      return SDValue();
13591
13592    // Break down insert_subvector into simpler parts.
13593    if (VT.getVectorElementType() == MVT::i1) {
13594      unsigned NumElts = VT.getVectorMinNumElements();
13595      EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13596
13597      SDValue Lo, Hi;
13598      Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
13599                       DAG.getVectorIdxConstant(0, DL));
13600      Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
13601                       DAG.getVectorIdxConstant(NumElts / 2, DL));
13602      if (Idx < (NumElts / 2)) {
13603        SDValue NewLo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
13604                                    DAG.getVectorIdxConstant(Idx, DL));
13605        return DAG.getNode(AArch64ISD::UZP1, DL, VT, NewLo, Hi);
13606      } else {
13607        SDValue NewHi =
13608            DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
13609                        DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
13610        return DAG.getNode(AArch64ISD::UZP1, DL, VT, Lo, NewHi);
13611      }
13612    }
13613
13614    // Ensure the subvector is half the size of the main vector.
13615    if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
13616      return SDValue();
13617
13618    // Here narrow and wide refer to the vector element types. After "casting",
13619    // both vectors must have the same bit length, so because the subvector
13620    // has fewer elements, those elements need to be wider.
13621    EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
13622    EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
13623
13624    // NOP cast operands to the largest legal vector of the same element count.
13625    if (VT.isFloatingPoint()) {
13626      Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
13627      Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG);
13628    } else {
13629      // Legal integer vectors are already their largest so Vec0 is fine as is.
13630      Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
13631    }
13632
13633    // To replace the top/bottom half of vector V with vector SubV we widen the
13634    // preserved half of V, concatenate this to SubV (the order depending on the
13635    // half being replaced) and then narrow the result.
13636    SDValue Narrow;
13637    if (Idx == 0) {
13638      SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
13639      Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
13640    } else {
13641      assert(Idx == InVT.getVectorMinNumElements() &&
13642             "Invalid subvector index!");
13643      SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
13644      Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
13645    }
13646
13647    return getSVESafeBitCast(VT, Narrow, DAG);
13648  }
13649
13650  if (Idx == 0 && isPackedVectorType(VT, DAG)) {
13651    // This will be matched by custom code during ISelDAGToDAG.
13652    if (Vec0.isUndef())
13653      return Op;
13654
13655    std::optional<unsigned> PredPattern =
13656        getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
13657    auto PredTy = VT.changeVectorElementType(MVT::i1);
13658    SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
13659    SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
13660    return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
13661  }
13662
13663  return SDValue();
13664}
13665
13666static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
13667  if (Op.getOpcode() != AArch64ISD::DUP &&
13668      Op.getOpcode() != ISD::SPLAT_VECTOR &&
13669      Op.getOpcode() != ISD::BUILD_VECTOR)
13670    return false;
13671
13672  if (Op.getOpcode() == ISD::BUILD_VECTOR &&
13673      !isAllConstantBuildVector(Op, SplatVal))
13674    return false;
13675
13676  if (Op.getOpcode() != ISD::BUILD_VECTOR &&
13677      !isa<ConstantSDNode>(Op->getOperand(0)))
13678    return false;
13679
13680  SplatVal = Op->getConstantOperandVal(0);
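  // The splat constant for sub-64-bit element types is carried as a 32-bit
  // value; sign-extend it so that negative values (e.g. -8) are recognised as
  // negated powers of two below.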
13681  if (Op.getValueType().getVectorElementType() != MVT::i64)
13682    SplatVal = (int32_t)SplatVal;
13683
13684  Negated = false;
13685  if (isPowerOf2_64(SplatVal))
13686    return true;
13687
13688  Negated = true;
13689  if (isPowerOf2_64(-SplatVal)) {
13690    SplatVal = -SplatVal;
13691    return true;
13692  }
13693
13694  return false;
13695}
13696
13697SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
13698  EVT VT = Op.getValueType();
13699  SDLoc dl(Op);
13700
13701  if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
13702    return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
13703
13704  assert(VT.isScalableVector() && "Expected a scalable vector.");
13705
13706  bool Signed = Op.getOpcode() == ISD::SDIV;
13707  unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
13708
13709  bool Negated;
13710  uint64_t SplatVal;
13711  if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
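    // A signed divide by a (possibly negated) power of two is lowered to a
    // predicated shift-right-for-divide node, e.g. (sdiv X, 8) becomes
    // (SRAD_MERGE_OP1 Pg, X, 3), with the result negated for (sdiv X, -8).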
13712    SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
13713    SDValue Res =
13714        DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
13715                    DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
13716    if (Negated)
13717      Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
13718
13719    return Res;
13720  }
13721
13722  if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
13723    return LowerToPredicatedOp(Op, DAG, PredOpcode);
13724
13725  // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
13726  // operations, and truncate the result.
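  // For example, an nxv16i8 divide is performed as two nxv8i16 divides on the
  // unpacked halves (each of which widens again in turn), with the results
  // repacked using UZP1.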
13727  EVT WidenedVT;
13728  if (VT == MVT::nxv16i8)
13729    WidenedVT = MVT::nxv8i16;
13730  else if (VT == MVT::nxv8i16)
13731    WidenedVT = MVT::nxv4i32;
13732  else
13733    llvm_unreachable("Unexpected Custom DIV operation");
13734
13735  unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
13736  unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
13737  SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
13738  SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
13739  SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
13740  SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
13741  SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
13742  SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
13743  return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
13744}
13745
13746bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
13747  // Currently no fixed length shuffles that require SVE are legal.
13748  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
13749    return false;
13750
13751  if (VT.getVectorNumElements() == 4 &&
13752      (VT.is128BitVector() || VT.is64BitVector())) {
13753    unsigned Cost = getPerfectShuffleCost(M);
13754    if (Cost <= 1)
13755      return true;
13756  }
13757
13758  bool DummyBool;
13759  int DummyInt;
13760  unsigned DummyUnsigned;
13761
13762  return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
13763          isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
13764          isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
13765          // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
13766          isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
13767          isZIPMask(M, VT, DummyUnsigned) ||
13768          isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
13769          isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
13770          isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
13771          isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
13772          isConcatMask(M, VT, VT.getSizeInBits() == 128));
13773}
13774
13775bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
13776                                                   EVT VT) const {
13777  // Just delegate to the generic legality, clear masks aren't special.
13778  return isShuffleMaskLegal(M, VT);
13779}
13780
13781/// getVShiftImm - Check if this is a valid build_vector for the immediate
13782/// operand of a vector shift operation, where all the elements of the
13783/// build_vector must have the same constant integer value.
13784static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
13785  // Ignore bit_converts.
13786  while (Op.getOpcode() == ISD::BITCAST)
13787    Op = Op.getOperand(0);
13788  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
13789  APInt SplatBits, SplatUndef;
13790  unsigned SplatBitSize;
13791  bool HasAnyUndefs;
13792  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
13793                                    HasAnyUndefs, ElementBits) ||
13794      SplatBitSize > ElementBits)
13795    return false;
13796  Cnt = SplatBits.getSExtValue();
13797  return true;
13798}
13799
13800/// isVShiftLImm - Check if this is a valid build_vector for the immediate
13801/// operand of a vector shift left operation.  That value must be in the range:
13802///   0 <= Value < ElementBits for a left shift; or
13803///   0 <= Value <= ElementBits for a long left shift.
13804static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
13805  assert(VT.isVector() && "vector shift count is not a vector type");
13806  int64_t ElementBits = VT.getScalarSizeInBits();
13807  if (!getVShiftImm(Op, ElementBits, Cnt))
13808    return false;
13809  return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
13810}
13811
13812/// isVShiftRImm - Check if this is a valid build_vector for the immediate
13813/// operand of a vector shift right operation. The value must be in the range:
13814///   1 <= Value <= ElementBits for a right shift; or
///   1 <= Value <= ElementBits/2 for a narrowing right shift.
13815static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
13816  assert(VT.isVector() && "vector shift count is not a vector type");
13817  int64_t ElementBits = VT.getScalarSizeInBits();
13818  if (!getVShiftImm(Op, ElementBits, Cnt))
13819    return false;
13820  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
13821}
13822
13823SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
13824                                             SelectionDAG &DAG) const {
13825  EVT VT = Op.getValueType();
13826
13827  if (VT.getScalarType() == MVT::i1) {
13828    // Lower i1 truncate to `(x & 1) != 0`.
13829    SDLoc dl(Op);
13830    EVT OpVT = Op.getOperand(0).getValueType();
13831    SDValue Zero = DAG.getConstant(0, dl, OpVT);
13832    SDValue One = DAG.getConstant(1, dl, OpVT);
13833    SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
13834    return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
13835  }
13836
13837  if (!VT.isVector() || VT.isScalableVector())
13838    return SDValue();
13839
13840  if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
13841                                   !Subtarget->isNeonAvailable()))
13842    return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
13843
13844  return SDValue();
13845}
13846
13847SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
13848                                                      SelectionDAG &DAG) const {
13849  EVT VT = Op.getValueType();
13850  SDLoc DL(Op);
13851  int64_t Cnt;
13852
13853  if (!Op.getOperand(1).getValueType().isVector())
13854    return Op;
13855  unsigned EltSize = VT.getScalarSizeInBits();
13856
13857  switch (Op.getOpcode()) {
13858  case ISD::SHL:
13859    if (VT.isScalableVector() ||
13860        useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
13861      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
13862
13863    if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
13864      return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
13865                         DAG.getConstant(Cnt, DL, MVT::i32));
13866    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
13867                       DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
13868                                       MVT::i32),
13869                       Op.getOperand(0), Op.getOperand(1));
13870  case ISD::SRA:
13871  case ISD::SRL:
13872    if (VT.isScalableVector() ||
13873        useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
13874      unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
13875                                                : AArch64ISD::SRL_PRED;
13876      return LowerToPredicatedOp(Op, DAG, Opc);
13877    }
13878
13879    // Right shift immediate
13880    if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
13881      unsigned Opc =
13882          (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
13883      return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
13884                         DAG.getConstant(Cnt, DL, MVT::i32));
13885    }
13886
13887    // Right shift by register. Note that there is no shift-right-by-register
13888    // instruction, but the shift-left-by-register instruction takes a signed
13889    // value, where negative numbers specify a right shift.
13890    unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
13891                                                : Intrinsic::aarch64_neon_ushl;
13892    // negate the shift amount
13893    SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
13894                                   Op.getOperand(1));
13895    SDValue NegShiftLeft =
13896        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
13897                    DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
13898                    NegShift);
13899    return NegShiftLeft;
13900  }
13901
13902  llvm_unreachable("unexpected shift opcode");
13903}
13904
13905static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
13906                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
13907                                    const SDLoc &dl, SelectionDAG &DAG) {
13908  EVT SrcVT = LHS.getValueType();
13909  assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
13910         "function only supposed to emit natural comparisons");
13911
13912  APInt SplatValue;
13913  APInt SplatUndef;
13914  unsigned SplatBitSize = 0;
13915  bool HasAnyUndefs;
13916
13917  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
13918  bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
13919                                            SplatBitSize, HasAnyUndefs);
13920
13921  bool IsZero = IsCnst && SplatValue == 0;
13922  bool IsOne =
13923      IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
13924  bool IsMinusOne = IsCnst && SplatValue.isAllOnes();
13925
13926  if (SrcVT.getVectorElementType().isFloatingPoint()) {
13927    switch (CC) {
13928    default:
13929      return SDValue();
13930    case AArch64CC::NE: {
13931      SDValue Fcmeq;
13932      if (IsZero)
13933        Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
13934      else
13935        Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
13936      return DAG.getNOT(dl, Fcmeq, VT);
13937    }
13938    case AArch64CC::EQ:
13939      if (IsZero)
13940        return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
13941      return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
13942    case AArch64CC::GE:
13943      if (IsZero)
13944        return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
13945      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
13946    case AArch64CC::GT:
13947      if (IsZero)
13948        return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
13949      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
13950    case AArch64CC::LE:
13951      if (!NoNans)
13952        return SDValue();
13953      // If we ignore NaNs then we can use the LS implementation.
13954      [[fallthrough]];
13955    case AArch64CC::LS:
13956      if (IsZero)
13957        return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
13958      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
13959    case AArch64CC::LT:
13960      if (!NoNans)
13961        return SDValue();
13962      // If we ignore NaNs then we can use the MI implementation.
13963      [[fallthrough]];
13964    case AArch64CC::MI:
13965      if (IsZero)
13966        return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
13967      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
13968    }
13969  }
13970
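  // Integer comparisons. Conditions without a native vector form (LO, LS and
  // the register variants of LT/LE) are emitted by swapping the operands of
  // the opposite comparison.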
13971  switch (CC) {
13972  default:
13973    return SDValue();
13974  case AArch64CC::NE: {
13975    SDValue Cmeq;
13976    if (IsZero)
13977      Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
13978    else
13979      Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
13980    return DAG.getNOT(dl, Cmeq, VT);
13981  }
13982  case AArch64CC::EQ:
13983    if (IsZero)
13984      return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
13985    return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
13986  case AArch64CC::GE:
13987    if (IsZero)
13988      return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
13989    return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
13990  case AArch64CC::GT:
13991    if (IsZero)
13992      return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
13993    if (IsMinusOne)
13994      return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS, RHS);
13995    return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
13996  case AArch64CC::LE:
13997    if (IsZero)
13998      return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
13999    return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
14000  case AArch64CC::LS:
14001    return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
14002  case AArch64CC::LO:
14003    return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
14004  case AArch64CC::LT:
14005    if (IsZero)
14006      return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
14007    if (IsOne)
14008      return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14009    return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
14010  case AArch64CC::HI:
14011    return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
14012  case AArch64CC::HS:
14013    return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
14014  }
14015}
14016
14017SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
14018                                           SelectionDAG &DAG) const {
14019  if (Op.getValueType().isScalableVector())
14020    return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
14021
14022  if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14023                                   !Subtarget->isNeonAvailable()))
14024    return LowerFixedLengthVectorSetccToSVE(Op, DAG);
14025
14026  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
14027  SDValue LHS = Op.getOperand(0);
14028  SDValue RHS = Op.getOperand(1);
14029  EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
14030  SDLoc dl(Op);
14031
14032  if (LHS.getValueType().getVectorElementType().isInteger()) {
14033    assert(LHS.getValueType() == RHS.getValueType());
14034    AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
14035    SDValue Cmp =
14036        EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
14037    return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14038  }
14039
14040  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
14041
14042  // Make v4f16 (only) fcmp operations utilise vector instructions;
14043  // v8f16 support will be a little more complicated.
14044  if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
14045    if (LHS.getValueType().getVectorNumElements() == 4) {
14046      LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
14047      RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
14048      SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
14049      DAG.ReplaceAllUsesWith(Op, NewSetcc);
14050      CmpVT = MVT::v4i32;
14051    } else
14052      return SDValue();
14053  }
14054
14055  assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
14056          LHS.getValueType().getVectorElementType() != MVT::f128);
14057
14058  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
14059  // clean.  Some of them require two branches to implement.
14060  AArch64CC::CondCode CC1, CC2;
14061  bool ShouldInvert;
14062  changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
14063
14064  bool NoNaNs =
      getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
14065  SDValue Cmp =
14066      EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
14067  if (!Cmp.getNode())
14068    return SDValue();
14069
14070  if (CC2 != AArch64CC::AL) {
14071    SDValue Cmp2 =
14072        EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
14073    if (!Cmp2.getNode())
14074      return SDValue();
14075
14076    Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
14077  }
14078
14079  Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14080
14081  if (ShouldInvert)
14082    Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
14083
14084  return Cmp;
14085}
14086
14087static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
14088                                  SelectionDAG &DAG) {
14089  SDValue VecOp = ScalarOp.getOperand(0);
14090  auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
14091  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
14092                     DAG.getConstant(0, DL, MVT::i64));
14093}
14094
14095static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
14096                                      SDLoc DL, SelectionDAG &DAG) {
14097  unsigned ScalarOpcode;
14098  switch (Opcode) {
14099  case ISD::VECREDUCE_AND:
14100    ScalarOpcode = ISD::AND;
14101    break;
14102  case ISD::VECREDUCE_OR:
14103    ScalarOpcode = ISD::OR;
14104    break;
14105  case ISD::VECREDUCE_XOR:
14106    ScalarOpcode = ISD::XOR;
14107    break;
14108  default:
14109    llvm_unreachable("Expected bitwise vector reduction");
14110    return SDValue();
14111  }
14112
14113  EVT VecVT = Vec.getValueType();
14114  assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
14115         "Expected power-of-2 length vector");
14116
14117  EVT ElemVT = VecVT.getVectorElementType();
14118
14119  SDValue Result;
14120  unsigned NumElems = VecVT.getVectorNumElements();
14121
14122  // Special case for boolean reductions
14123  if (ElemVT == MVT::i1) {
14124    // Split large vectors into smaller ones
14125    if (NumElems > 16) {
14126      SDValue Lo, Hi;
14127      std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
14128      EVT HalfVT = Lo.getValueType();
14129      SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
14130      return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
14131    }
14132
14133    // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
14134    // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
14135    // this element size leads to the best codegen, since e.g. setcc results
14136    // might need to be truncated otherwise.
14137    EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u));
14138
14139    // any_ext doesn't work with umin/umax, so only use it for uadd.
14140    unsigned ExtendOp =
14141        ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
14142    SDValue Extended = DAG.getNode(
14143        ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
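    // With 0/-1 lanes, AND reduces to an unsigned minimum and OR to an
    // unsigned maximum; for XOR, bit 0 of the lane sum gives the parity.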
14144    switch (ScalarOpcode) {
14145    case ISD::AND:
14146      Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
14147      break;
14148    case ISD::OR:
14149      Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
14150      break;
14151    case ISD::XOR:
14152      Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
14153      break;
14154    default:
14155      llvm_unreachable("Unexpected Opcode");
14156    }
14157
14158    Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
14159  } else {
14160    // Iteratively split the vector in half and combine using the bitwise
14161    // operation until it fits in a 64 bit register.
14162    while (VecVT.getSizeInBits() > 64) {
14163      SDValue Lo, Hi;
14164      std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
14165      VecVT = Lo.getValueType();
14166      NumElems = VecVT.getVectorNumElements();
14167      Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
14168    }
14169
14170    EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
14171
14172    // Do the remaining work on a scalar since it allows the code generator to
14173    // combine the shift and bitwise operation into one instruction and since
14174    // integer instructions can have higher throughput than vector instructions.
14175    SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
14176
14177    // Iteratively combine the lower and upper halves of the scalar using the
14178    // bitwise operation, halving the relevant region of the scalar in each
14179    // iteration, until the relevant region is just one element of the original
14180    // vector.
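    // For example, for a v2i32 XOR reduction the 64-bit scalar is folded as
    //   x = bitcast Vec to i64; x ^= (x >> 32); result = trunc x to i32.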
14181    for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
14182      SDValue ShiftAmount =
14183          DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
14184      SDValue Shifted =
14185          DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
14186      Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
14187    }
14188
14189    Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
14190  }
14191
14192  return DAG.getAnyExtOrTrunc(Result, DL, VT);
14193}
14194
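/// Lower vector reductions: use the SVE predicated reductions when the source
/// is scalable or when fixed-length SVE lowering is preferable, and otherwise
/// fall back to the NEON across-lane reductions or the bitwise helper above.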
14195SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
14196                                              SelectionDAG &DAG) const {
14197  SDValue Src = Op.getOperand(0);
14198
14199  // Try to lower fixed length reductions to SVE.
14200  EVT SrcVT = Src.getValueType();
14201  bool OverrideNEON = !Subtarget->isNeonAvailable() ||
14202                      Op.getOpcode() == ISD::VECREDUCE_AND ||
14203                      Op.getOpcode() == ISD::VECREDUCE_OR ||
14204                      Op.getOpcode() == ISD::VECREDUCE_XOR ||
14205                      Op.getOpcode() == ISD::VECREDUCE_FADD ||
14206                      (Op.getOpcode() != ISD::VECREDUCE_ADD &&
14207                       SrcVT.getVectorElementType() == MVT::i64);
14208  if (SrcVT.isScalableVector() ||
14209      useSVEForFixedLengthVectorVT(
14210          SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
14211
14212    if (SrcVT.getVectorElementType() == MVT::i1)
14213      return LowerPredReductionToSVE(Op, DAG);
14214
14215    switch (Op.getOpcode()) {
14216    case ISD::VECREDUCE_ADD:
14217      return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
14218    case ISD::VECREDUCE_AND:
14219      return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
14220    case ISD::VECREDUCE_OR:
14221      return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
14222    case ISD::VECREDUCE_SMAX:
14223      return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
14224    case ISD::VECREDUCE_SMIN:
14225      return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
14226    case ISD::VECREDUCE_UMAX:
14227      return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
14228    case ISD::VECREDUCE_UMIN:
14229      return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
14230    case ISD::VECREDUCE_XOR:
14231      return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
14232    case ISD::VECREDUCE_FADD:
14233      return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
14234    case ISD::VECREDUCE_FMAX:
14235      return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
14236    case ISD::VECREDUCE_FMIN:
14237      return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
14238    case ISD::VECREDUCE_FMAXIMUM:
14239      return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
14240    case ISD::VECREDUCE_FMINIMUM:
14241      return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
14242    default:
14243      llvm_unreachable("Unhandled fixed length reduction");
14244    }
14245  }
14246
14247  // Lower NEON reductions.
14248  SDLoc dl(Op);
14249  switch (Op.getOpcode()) {
14250  case ISD::VECREDUCE_AND:
14251  case ISD::VECREDUCE_OR:
14252  case ISD::VECREDUCE_XOR:
14253    return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
14254                                  Op.getValueType(), dl, DAG);
14255  case ISD::VECREDUCE_ADD:
14256    return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
14257  case ISD::VECREDUCE_SMAX:
14258    return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
14259  case ISD::VECREDUCE_SMIN:
14260    return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
14261  case ISD::VECREDUCE_UMAX:
14262    return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
14263  case ISD::VECREDUCE_UMIN:
14264    return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
14265  default:
14266    llvm_unreachable("Unhandled reduction");
14267  }
14268}
14269
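/// Lower ATOMIC_LOAD_AND to ATOMIC_LOAD_CLR by inverting the RHS, using the
/// identity x AND y == x CLR (NOT y).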
14270SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
14271                                                    SelectionDAG &DAG) const {
14272  auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14273  // No point replacing if we don't have the relevant instruction/libcall anyway
14274  if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
14275    return SDValue();
14276
14277  // LSE has an atomic load-clear instruction, but not a load-and.
14278  SDLoc dl(Op);
14279  MVT VT = Op.getSimpleValueType();
14280  assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
14281  SDValue RHS = Op.getOperand(2);
14282  AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
14283  RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
14284  return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
14285                       Op.getOperand(0), Op.getOperand(1), RHS,
14286                       AN->getMemOperand());
14287}
14288
14289SDValue
14290AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
14291                                                      SelectionDAG &DAG) const {
14292
14293  SDLoc dl(Op);
14294  // Get the inputs.
14295  SDNode *Node = Op.getNode();
14296  SDValue Chain = Op.getOperand(0);
14297  SDValue Size = Op.getOperand(1);
14298  MaybeAlign Align =
14299      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
14300  EVT VT = Node->getValueType(0);
14301
14302  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
14303          "no-stack-arg-probe")) {
14304    SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14305    Chain = SP.getValue(1);
14306    SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14307    if (Align)
14308      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14309                       DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14310    Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14311    SDValue Ops[2] = {SP, Chain};
14312    return DAG.getMergeValues(Ops, dl);
14313  }
14314
14315  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
14316
14317  EVT PtrVT = getPointerTy(DAG.getDataLayout());
14318  SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
14319                                               PtrVT, 0);
14320
14321  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
14322  const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
14323  if (Subtarget->hasCustomCallingConv())
14324    TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
14325
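  // The Windows stack probe helper expects the allocation size in X15, in
  // units of 16 bytes, hence the shift right by 4 here and the shift left by
  // 4 after the call.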
14326  Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
14327                     DAG.getConstant(4, dl, MVT::i64));
14328  Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
14329  Chain =
14330      DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
14331                  Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
14332                  DAG.getRegisterMask(Mask), Chain.getValue(1));
14333  // To match the actual intent better, we should read the output from X15 here
14334  // again (instead of potentially spilling it to the stack), but rereading Size
14335  // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
14336  // here.
14337
14338  Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
14339                     DAG.getConstant(4, dl, MVT::i64));
14340
14341  SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14342  Chain = SP.getValue(1);
14343  SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14344  if (Align)
14345    SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14346                     DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14347  Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14348
14349  Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
14350
14351  SDValue Ops[2] = {SP, Chain};
14352  return DAG.getMergeValues(Ops, dl);
14353}
14354
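/// Lower a dynamic stack allocation when inline stack probing is enabled: the
/// new SP value is computed in a GPR and PROBED_ALLOCA is expanded into a
/// loop that probes and moves SP down to that value.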
14355SDValue
14356AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
14357                                                     SelectionDAG &DAG) const {
14358  // Get the inputs.
14359  SDNode *Node = Op.getNode();
14360  SDValue Chain = Op.getOperand(0);
14361  SDValue Size = Op.getOperand(1);
14362
14363  MaybeAlign Align =
14364      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
14365  SDLoc dl(Op);
14366  EVT VT = Node->getValueType(0);
14367
14368  // Construct the new SP value in a GPR.
14369  SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14370  Chain = SP.getValue(1);
14371  SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14372  if (Align)
14373    SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14374                     DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14375
14376  // Set the real SP to the new value with a probing loop.
14377  Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
14378  SDValue Ops[2] = {SP, Chain};
14379  return DAG.getMergeValues(Ops, dl);
14380}
14381
14382SDValue
14383AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
14384                                               SelectionDAG &DAG) const {
14385  MachineFunction &MF = DAG.getMachineFunction();
14386
14387  if (Subtarget->isTargetWindows())
14388    return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
14389  else if (hasInlineStackProbe(MF))
14390    return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
14391  else
14392    return SDValue();
14393}
14394
14395// When x and y are extended, lower:
14396//   avgfloor(x, y) -> (x + y) >> 1
14397//   avgceil(x, y)  -> (x + y + 1) >> 1
14398
// Otherwise, lower to:
//   avgfloor(x, y) -> (x >> 1) + (y >> 1) + (x & y & 1)
//   avgceil(x, y)  -> (x >> 1) + (y >> 1) + ((x | y) & 1)
14402SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
14403                                        unsigned NewOp) const {
14404  if (Subtarget->hasSVE2())
14405    return LowerToPredicatedOp(Op, DAG, NewOp);
14406
14407  SDLoc dl(Op);
14408  SDValue OpA = Op->getOperand(0);
14409  SDValue OpB = Op->getOperand(1);
14410  EVT VT = Op.getValueType();
14411  bool IsCeil =
14412      (Op->getOpcode() == ISD::AVGCEILS || Op->getOpcode() == ISD::AVGCEILU);
14413  bool IsSigned =
14414      (Op->getOpcode() == ISD::AVGFLOORS || Op->getOpcode() == ISD::AVGCEILS);
14415  unsigned ShiftOpc = IsSigned ? ISD::SRA : ISD::SRL;
14416
14417  assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
14418
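  // The (x + y) forms are only safe when the addition cannot wrap, i.e. when
  // both operands have a known-zero top bit (unsigned) or more than one known
  // sign bit (signed).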
14419  auto IsZeroExtended = [&DAG](SDValue &Node) {
14420    KnownBits Known = DAG.computeKnownBits(Node, 0);
14421    return Known.Zero.isSignBitSet();
14422  };
14423
14424  auto IsSignExtended = [&DAG](SDValue &Node) {
14425    return (DAG.ComputeNumSignBits(Node, 0) > 1);
14426  };
14427
14428  SDValue ConstantOne = DAG.getConstant(1, dl, VT);
14429  if ((!IsSigned && IsZeroExtended(OpA) && IsZeroExtended(OpB)) ||
14430      (IsSigned && IsSignExtended(OpA) && IsSignExtended(OpB))) {
14431    SDValue Add = DAG.getNode(ISD::ADD, dl, VT, OpA, OpB);
14432    if (IsCeil)
14433      Add = DAG.getNode(ISD::ADD, dl, VT, Add, ConstantOne);
14434    return DAG.getNode(ShiftOpc, dl, VT, Add, ConstantOne);
14435  }
14436
14437  SDValue ShiftOpA = DAG.getNode(ShiftOpc, dl, VT, OpA, ConstantOne);
14438  SDValue ShiftOpB = DAG.getNode(ShiftOpc, dl, VT, OpB, ConstantOne);
14439
14440  SDValue tmp = DAG.getNode(IsCeil ? ISD::OR : ISD::AND, dl, VT, OpA, OpB);
14441  tmp = DAG.getNode(ISD::AND, dl, VT, tmp, ConstantOne);
14442  SDValue Add = DAG.getNode(ISD::ADD, dl, VT, ShiftOpA, ShiftOpB);
14443  return DAG.getNode(ISD::ADD, dl, VT, Add, tmp);
14444}
14445
14446SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
14447                                           SelectionDAG &DAG) const {
14448  EVT VT = Op.getValueType();
14449  assert(VT != MVT::i64 && "Expected illegal VSCALE node");
14450
14451  SDLoc DL(Op);
14452  APInt MulImm = Op.getConstantOperandAPInt(0);
14453  return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
14454                            VT);
14455}
14456
14457/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
14458template <unsigned NumVecs>
14459static bool
14460setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
14461              AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
14462  Info.opc = ISD::INTRINSIC_VOID;
14463  // Retrieve EC from first vector argument.
14464  const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
14465  ElementCount EC = VT.getVectorElementCount();
14466#ifndef NDEBUG
14467  // Check the assumption that all input vectors are the same type.
14468  for (unsigned I = 0; I < NumVecs; ++I)
14469    assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
14470           "Invalid type.");
14471#endif
14472  // memVT is `NumVecs * VT`.
14473  Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
14474                                EC * NumVecs);
14475  Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
14476  Info.offset = 0;
14477  Info.align.reset();
14478  Info.flags = MachineMemOperand::MOStore;
14479  return true;
14480}
14481
/// getTgtMemIntrinsic - Represent target memory intrinsics (NEON and SVE
/// loads/stores, exclusive and MOPS operations) as MemIntrinsicNodes. The
/// associated MachineMemOperands record the alignment specified in the
/// intrinsic calls.
14485bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
14486                                               const CallInst &I,
14487                                               MachineFunction &MF,
14488                                               unsigned Intrinsic) const {
14489  auto &DL = I.getModule()->getDataLayout();
14490  switch (Intrinsic) {
14491  case Intrinsic::aarch64_sve_st2:
14492    return setInfoSVEStN<2>(*this, DL, Info, I);
14493  case Intrinsic::aarch64_sve_st3:
14494    return setInfoSVEStN<3>(*this, DL, Info, I);
14495  case Intrinsic::aarch64_sve_st4:
14496    return setInfoSVEStN<4>(*this, DL, Info, I);
14497  case Intrinsic::aarch64_neon_ld2:
14498  case Intrinsic::aarch64_neon_ld3:
14499  case Intrinsic::aarch64_neon_ld4:
14500  case Intrinsic::aarch64_neon_ld1x2:
14501  case Intrinsic::aarch64_neon_ld1x3:
14502  case Intrinsic::aarch64_neon_ld1x4: {
14503    Info.opc = ISD::INTRINSIC_W_CHAIN;
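    // Conservatively describe the memory as a vector of i64 elements covering
    // the combined size of all the returned vectors.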
14504    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
14505    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14506    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14507    Info.offset = 0;
14508    Info.align.reset();
14509    // volatile loads with NEON intrinsics not supported
14510    Info.flags = MachineMemOperand::MOLoad;
14511    return true;
14512  }
14513  case Intrinsic::aarch64_neon_ld2lane:
14514  case Intrinsic::aarch64_neon_ld3lane:
14515  case Intrinsic::aarch64_neon_ld4lane:
14516  case Intrinsic::aarch64_neon_ld2r:
14517  case Intrinsic::aarch64_neon_ld3r:
14518  case Intrinsic::aarch64_neon_ld4r: {
14519    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // These intrinsics return a struct of vectors, all with the same type.
14521    Type *RetTy = I.getType();
14522    auto *StructTy = cast<StructType>(RetTy);
14523    unsigned NumElts = StructTy->getNumElements();
14524    Type *VecTy = StructTy->getElementType(0);
14525    MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
14526    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
14527    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14528    Info.offset = 0;
14529    Info.align.reset();
14530    // volatile loads with NEON intrinsics not supported
14531    Info.flags = MachineMemOperand::MOLoad;
14532    return true;
14533  }
14534  case Intrinsic::aarch64_neon_st2:
14535  case Intrinsic::aarch64_neon_st3:
14536  case Intrinsic::aarch64_neon_st4:
14537  case Intrinsic::aarch64_neon_st1x2:
14538  case Intrinsic::aarch64_neon_st1x3:
14539  case Intrinsic::aarch64_neon_st1x4: {
14540    Info.opc = ISD::INTRINSIC_VOID;
14541    unsigned NumElts = 0;
14542    for (const Value *Arg : I.args()) {
14543      Type *ArgTy = Arg->getType();
14544      if (!ArgTy->isVectorTy())
14545        break;
14546      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
14547    }
14548    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14549    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14550    Info.offset = 0;
14551    Info.align.reset();
14552    // volatile stores with NEON intrinsics not supported
14553    Info.flags = MachineMemOperand::MOStore;
14554    return true;
14555  }
14556  case Intrinsic::aarch64_neon_st2lane:
14557  case Intrinsic::aarch64_neon_st3lane:
14558  case Intrinsic::aarch64_neon_st4lane: {
14559    Info.opc = ISD::INTRINSIC_VOID;
14560    unsigned NumElts = 0;
    // All vector arguments have the same type.
14562    Type *VecTy = I.getArgOperand(0)->getType();
14563    MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
14564
14565    for (const Value *Arg : I.args()) {
14566      Type *ArgTy = Arg->getType();
14567      if (!ArgTy->isVectorTy())
14568        break;
14569      NumElts += 1;
14570    }
14571
14572    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
14573    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14574    Info.offset = 0;
14575    Info.align.reset();
14576    // volatile stores with NEON intrinsics not supported
14577    Info.flags = MachineMemOperand::MOStore;
14578    return true;
14579  }
14580  case Intrinsic::aarch64_ldaxr:
14581  case Intrinsic::aarch64_ldxr: {
14582    Type *ValTy = I.getParamElementType(0);
14583    Info.opc = ISD::INTRINSIC_W_CHAIN;
14584    Info.memVT = MVT::getVT(ValTy);
14585    Info.ptrVal = I.getArgOperand(0);
14586    Info.offset = 0;
14587    Info.align = DL.getABITypeAlign(ValTy);
14588    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
14589    return true;
14590  }
14591  case Intrinsic::aarch64_stlxr:
14592  case Intrinsic::aarch64_stxr: {
14593    Type *ValTy = I.getParamElementType(1);
14594    Info.opc = ISD::INTRINSIC_W_CHAIN;
14595    Info.memVT = MVT::getVT(ValTy);
14596    Info.ptrVal = I.getArgOperand(1);
14597    Info.offset = 0;
14598    Info.align = DL.getABITypeAlign(ValTy);
14599    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
14600    return true;
14601  }
14602  case Intrinsic::aarch64_ldaxp:
14603  case Intrinsic::aarch64_ldxp:
14604    Info.opc = ISD::INTRINSIC_W_CHAIN;
14605    Info.memVT = MVT::i128;
14606    Info.ptrVal = I.getArgOperand(0);
14607    Info.offset = 0;
14608    Info.align = Align(16);
14609    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
14610    return true;
14611  case Intrinsic::aarch64_stlxp:
14612  case Intrinsic::aarch64_stxp:
14613    Info.opc = ISD::INTRINSIC_W_CHAIN;
14614    Info.memVT = MVT::i128;
14615    Info.ptrVal = I.getArgOperand(2);
14616    Info.offset = 0;
14617    Info.align = Align(16);
14618    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
14619    return true;
14620  case Intrinsic::aarch64_sve_ldnt1: {
14621    Type *ElTy = cast<VectorType>(I.getType())->getElementType();
14622    Info.opc = ISD::INTRINSIC_W_CHAIN;
14623    Info.memVT = MVT::getVT(I.getType());
14624    Info.ptrVal = I.getArgOperand(1);
14625    Info.offset = 0;
14626    Info.align = DL.getABITypeAlign(ElTy);
14627    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
14628    return true;
14629  }
14630  case Intrinsic::aarch64_sve_stnt1: {
14631    Type *ElTy =
14632        cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
14633    Info.opc = ISD::INTRINSIC_W_CHAIN;
14634    Info.memVT = MVT::getVT(I.getOperand(0)->getType());
14635    Info.ptrVal = I.getArgOperand(2);
14636    Info.offset = 0;
14637    Info.align = DL.getABITypeAlign(ElTy);
14638    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
14639    return true;
14640  }
14641  case Intrinsic::aarch64_mops_memset_tag: {
14642    Value *Dst = I.getArgOperand(0);
14643    Value *Val = I.getArgOperand(1);
14644    Info.opc = ISD::INTRINSIC_W_CHAIN;
14645    Info.memVT = MVT::getVT(Val->getType());
14646    Info.ptrVal = Dst;
14647    Info.offset = 0;
14648    Info.align = I.getParamAlign(0).valueOrOne();
14649    Info.flags = MachineMemOperand::MOStore;
14650    // The size of the memory being operated on is unknown at this point
14651    Info.size = MemoryLocation::UnknownSize;
14652    return true;
14653  }
14654  default:
14655    break;
14656  }
14657
14658  return false;
14659}
14660
14661bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
14662                                                  ISD::LoadExtType ExtTy,
14663                                                  EVT NewVT) const {
14664  // TODO: This may be worth removing. Check regression tests for diffs.
14665  if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
14666    return false;
14667
14668  // If we're reducing the load width in order to avoid having to use an extra
14669  // instruction to do extension then it's probably a good idea.
14670  if (ExtTy != ISD::NON_EXTLOAD)
14671    return true;
14672  // Don't reduce load width if it would prevent us from combining a shift into
14673  // the offset.
14674  MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
14675  assert(Mem);
14676  const SDValue &Base = Mem->getBasePtr();
14677  if (Base.getOpcode() == ISD::ADD &&
14678      Base.getOperand(1).getOpcode() == ISD::SHL &&
14679      Base.getOperand(1).hasOneUse() &&
14680      Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
14681    // It's unknown whether a scalable vector has a power-of-2 bitwidth.
14682    if (Mem->getMemoryVT().isScalableVector())
14683      return false;
14684    // The shift can be combined if it matches the size of the value being
14685    // loaded (and so reducing the width would make it not match).
14686    uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
    uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits() / 8;
14688    if (ShiftAmount == Log2_32(LoadBytes))
14689      return false;
14690  }
14691  // We have no reason to disallow reducing the load width, so allow it.
14692  return true;
14693}
14694
14695// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
14696bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
14697  EVT VT = Extend.getValueType();
14698  if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
14699    SDValue Extract = Extend.getOperand(0);
14700    if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
14701      Extract = Extract.getOperand(0);
14702    if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
14703      EVT VecVT = Extract.getOperand(0).getValueType();
14704      if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
14705        return false;
14706    }
14707  }
14708  return true;
14709}
14710
// Truncations from a 64-bit GPR to a 32-bit GPR are free.
14712bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
14713  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
14714    return false;
14715  uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
14716  uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
14717  return NumBits1 > NumBits2;
14718}
14719bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
14720  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
14721    return false;
14722  uint64_t NumBits1 = VT1.getFixedSizeInBits();
14723  uint64_t NumBits2 = VT2.getFixedSizeInBits();
14724  return NumBits1 > NumBits2;
14725}
14726
/// Check if it is profitable to hoist an instruction from a then/else block
/// into the if block. It is not profitable if \p I and its user can form an
/// FMA instruction, because we prefer FMSUB/FMADD.
14730bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
14731  if (I->getOpcode() != Instruction::FMul)
14732    return true;
14733
14734  if (!I->hasOneUse())
14735    return true;
14736
14737  Instruction *User = I->user_back();
14738
14739  if (!(User->getOpcode() == Instruction::FSub ||
14740        User->getOpcode() == Instruction::FAdd))
14741    return true;
14742
14743  const TargetOptions &Options = getTargetMachine().Options;
14744  const Function *F = I->getFunction();
14745  const DataLayout &DL = F->getParent()->getDataLayout();
14746  Type *Ty = User->getOperand(0)->getType();
14747
14748  return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
14749           isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
14750           (Options.AllowFPOpFusion == FPOpFusion::Fast ||
14751            Options.UnsafeFPMath));
14752}
14753
14754// All 32-bit GPR operations implicitly zero the high-half of the corresponding
14755// 64-bit GPR.
14756bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
14757  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
14758    return false;
14759  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
14760  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
14761  return NumBits1 == 32 && NumBits2 == 64;
14762}
14763bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
14764  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
14765    return false;
14766  unsigned NumBits1 = VT1.getSizeInBits();
14767  unsigned NumBits2 = VT2.getSizeInBits();
14768  return NumBits1 == 32 && NumBits2 == 64;
14769}
14770
14771bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
14772  EVT VT1 = Val.getValueType();
14773  if (isZExtFree(VT1, VT2)) {
14774    return true;
14775  }
14776
14777  if (Val.getOpcode() != ISD::LOAD)
14778    return false;
14779
14780  // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
14781  return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
14782          VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
14783          VT1.getSizeInBits() <= 32);
14784}
14785
14786bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
14787  if (isa<FPExtInst>(Ext))
14788    return false;
14789
14790  // Vector types are not free.
14791  if (Ext->getType()->isVectorTy())
14792    return false;
14793
14794  for (const Use &U : Ext->uses()) {
14795    // The extension is free if we can fold it with a left shift in an
14796    // addressing mode or an arithmetic operation: add, sub, and cmp.
14797
14798    // Is there a shift?
14799    const Instruction *Instr = cast<Instruction>(U.getUser());
14800
14801    // Is this a constant shift?
14802    switch (Instr->getOpcode()) {
14803    case Instruction::Shl:
14804      if (!isa<ConstantInt>(Instr->getOperand(1)))
14805        return false;
14806      break;
14807    case Instruction::GetElementPtr: {
14808      gep_type_iterator GTI = gep_type_begin(Instr);
14809      auto &DL = Ext->getModule()->getDataLayout();
      std::advance(GTI, U.getOperandNo() - 1);
14811      Type *IdxTy = GTI.getIndexedType();
14812      // This extension will end up with a shift because of the scaling factor.
14813      // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
14814      // Get the shift amount based on the scaling factor:
14815      // log2(sizeof(IdxTy)) - log2(8).
14816      if (IdxTy->isScalableTy())
14817        return false;
14818      uint64_t ShiftAmt =
14819          llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
14820          3;
14821      // Is the constant foldable in the shift of the addressing mode?
14822      // I.e., shift amount is between 1 and 4 inclusive.
14823      if (ShiftAmt == 0 || ShiftAmt > 4)
14824        return false;
14825      break;
14826    }
14827    case Instruction::Trunc:
14828      // Check if this is a noop.
14829      // trunc(sext ty1 to ty2) to ty1.
14830      if (Instr->getType() == Ext->getOperand(0)->getType())
14831        continue;
14832      [[fallthrough]];
14833    default:
14834      return false;
14835    }
14836
14837    // At this point we can use the bfm family, so this extension is free
14838    // for that use.
14839  }
14840  return true;
14841}
14842
14843static bool isSplatShuffle(Value *V) {
14844  if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
14845    return all_equal(Shuf->getShuffleMask());
14846  return false;
14847}
14848
14849/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
14850/// or upper half of the vector elements.
14851static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
14852                                     bool AllowSplat = false) {
14853  auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
14854    auto *FullTy = FullV->getType();
14855    auto *HalfTy = HalfV->getType();
14856    return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
14857           2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
14858  };
14859
14860  auto extractHalf = [](Value *FullV, Value *HalfV) {
14861    auto *FullVT = cast<FixedVectorType>(FullV->getType());
14862    auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
14863    return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
14864  };
14865
14866  ArrayRef<int> M1, M2;
14867  Value *S1Op1 = nullptr, *S2Op1 = nullptr;
14868  if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
14869      !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
14870    return false;
14871
  // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so
  // that it is not checked as an extract below.
14874  if (AllowSplat && isSplatShuffle(Op1))
14875    S1Op1 = nullptr;
14876  if (AllowSplat && isSplatShuffle(Op2))
14877    S2Op1 = nullptr;
14878
14879  // Check that the operands are half as wide as the result and we extract
14880  // half of the elements of the input vectors.
14881  if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
14882      (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
14883    return false;
14884
14885  // Check the mask extracts either the lower or upper half of vector
14886  // elements.
14887  int M1Start = 0;
14888  int M2Start = 0;
14889  int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
14890  if ((S1Op1 &&
14891       !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
14892      (S2Op1 &&
14893       !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
14894    return false;
14895
14896  if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
14897      (M2Start != 0 && M2Start != (NumElements / 2)))
14898    return false;
14899  if (S1Op1 && S2Op1 && M1Start != M2Start)
14900    return false;
14901
14902  return true;
14903}
14904
14905/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
14906/// of the vector elements.
14907static bool areExtractExts(Value *Ext1, Value *Ext2) {
14908  auto areExtDoubled = [](Instruction *Ext) {
14909    return Ext->getType()->getScalarSizeInBits() ==
14910           2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
14911  };
14912
14913  if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
14914      !match(Ext2, m_ZExtOrSExt(m_Value())) ||
14915      !areExtDoubled(cast<Instruction>(Ext1)) ||
14916      !areExtDoubled(cast<Instruction>(Ext2)))
14917    return false;
14918
14919  return true;
14920}
14921
14922/// Check if Op could be used with vmull_high_p64 intrinsic.
14923static bool isOperandOfVmullHighP64(Value *Op) {
14924  Value *VectorOperand = nullptr;
14925  ConstantInt *ElementIndex = nullptr;
14926  return match(Op, m_ExtractElt(m_Value(VectorOperand),
14927                                m_ConstantInt(ElementIndex))) &&
14928         ElementIndex->getValue() == 1 &&
14929         isa<FixedVectorType>(VectorOperand->getType()) &&
14930         cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
14931}
14932
14933/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
14934static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
14935  return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
14936}
14937
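/// Check whether \p Ptrs is a scalar_base + vector_offsets GEP of the form
/// CodeGenPrepare produces for masked gathers/scatters; if so, record in
/// \p Ops any zero/sign extension of the offsets that is worth sinking too.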
14938static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
14939  // Restrict ourselves to the form CodeGenPrepare typically constructs.
14940  auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
14941  if (!GEP || GEP->getNumOperands() != 2)
14942    return false;
14943
14944  Value *Base = GEP->getOperand(0);
14945  Value *Offsets = GEP->getOperand(1);
14946
14947  // We only care about scalar_base+vector_offsets.
14948  if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
14949    return false;
14950
14951  // Sink extends that would allow us to use 32-bit offset vectors.
14952  if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
14953    auto *OffsetsInst = cast<Instruction>(Offsets);
14954    if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
14955        OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
14956      Ops.push_back(&GEP->getOperandUse(1));
14957  }
14958
14959  // Sink the GEP.
14960  return true;
14961}
14962
/// We want to sink the following cases:
14964/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale
14965static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
14966  if (match(Op, m_VScale()))
14967    return true;
14968  if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
14969      match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
14970    Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
14971    return true;
14972  }
14973  return false;
14974}
14975
14976/// Check if sinking \p I's operands to I's basic block is profitable, because
14977/// the operands can be folded into a target instruction, e.g.
14978/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
14979bool AArch64TargetLowering::shouldSinkOperands(
14980    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
14981  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
14982    switch (II->getIntrinsicID()) {
14983    case Intrinsic::aarch64_neon_smull:
14984    case Intrinsic::aarch64_neon_umull:
14985      if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
14986                                   /*AllowSplat=*/true)) {
14987        Ops.push_back(&II->getOperandUse(0));
14988        Ops.push_back(&II->getOperandUse(1));
14989        return true;
14990      }
14991      [[fallthrough]];
14992
14993    case Intrinsic::fma:
14994      if (isa<VectorType>(I->getType()) &&
14995          cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
14996          !Subtarget->hasFullFP16())
14997        return false;
14998      [[fallthrough]];
14999    case Intrinsic::aarch64_neon_sqdmull:
15000    case Intrinsic::aarch64_neon_sqdmulh:
15001    case Intrinsic::aarch64_neon_sqrdmulh:
15002      // Sink splats for index lane variants
15003      if (isSplatShuffle(II->getOperand(0)))
15004        Ops.push_back(&II->getOperandUse(0));
15005      if (isSplatShuffle(II->getOperand(1)))
15006        Ops.push_back(&II->getOperandUse(1));
15007      return !Ops.empty();
15008    case Intrinsic::aarch64_neon_fmlal:
15009    case Intrinsic::aarch64_neon_fmlal2:
15010    case Intrinsic::aarch64_neon_fmlsl:
15011    case Intrinsic::aarch64_neon_fmlsl2:
15012      // Sink splats for index lane variants
15013      if (isSplatShuffle(II->getOperand(1)))
15014        Ops.push_back(&II->getOperandUse(1));
15015      if (isSplatShuffle(II->getOperand(2)))
15016        Ops.push_back(&II->getOperandUse(2));
15017      return !Ops.empty();
15018    case Intrinsic::aarch64_sve_ptest_first:
15019    case Intrinsic::aarch64_sve_ptest_last:
15020      if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
15021        if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
15022          Ops.push_back(&II->getOperandUse(0));
15023      return !Ops.empty();
15024    case Intrinsic::aarch64_sme_write_horiz:
15025    case Intrinsic::aarch64_sme_write_vert:
15026    case Intrinsic::aarch64_sme_writeq_horiz:
15027    case Intrinsic::aarch64_sme_writeq_vert: {
15028      auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
15029      if (!Idx || Idx->getOpcode() != Instruction::Add)
15030        return false;
15031      Ops.push_back(&II->getOperandUse(1));
15032      return true;
15033    }
15034    case Intrinsic::aarch64_sme_read_horiz:
15035    case Intrinsic::aarch64_sme_read_vert:
15036    case Intrinsic::aarch64_sme_readq_horiz:
15037    case Intrinsic::aarch64_sme_readq_vert:
15038    case Intrinsic::aarch64_sme_ld1b_vert:
15039    case Intrinsic::aarch64_sme_ld1h_vert:
15040    case Intrinsic::aarch64_sme_ld1w_vert:
15041    case Intrinsic::aarch64_sme_ld1d_vert:
15042    case Intrinsic::aarch64_sme_ld1q_vert:
15043    case Intrinsic::aarch64_sme_st1b_vert:
15044    case Intrinsic::aarch64_sme_st1h_vert:
15045    case Intrinsic::aarch64_sme_st1w_vert:
15046    case Intrinsic::aarch64_sme_st1d_vert:
15047    case Intrinsic::aarch64_sme_st1q_vert:
15048    case Intrinsic::aarch64_sme_ld1b_horiz:
15049    case Intrinsic::aarch64_sme_ld1h_horiz:
15050    case Intrinsic::aarch64_sme_ld1w_horiz:
15051    case Intrinsic::aarch64_sme_ld1d_horiz:
15052    case Intrinsic::aarch64_sme_ld1q_horiz:
15053    case Intrinsic::aarch64_sme_st1b_horiz:
15054    case Intrinsic::aarch64_sme_st1h_horiz:
15055    case Intrinsic::aarch64_sme_st1w_horiz:
15056    case Intrinsic::aarch64_sme_st1d_horiz:
15057    case Intrinsic::aarch64_sme_st1q_horiz: {
15058      auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
15059      if (!Idx || Idx->getOpcode() != Instruction::Add)
15060        return false;
15061      Ops.push_back(&II->getOperandUse(3));
15062      return true;
15063    }
15064    case Intrinsic::aarch64_neon_pmull:
15065      if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
15066        return false;
15067      Ops.push_back(&II->getOperandUse(0));
15068      Ops.push_back(&II->getOperandUse(1));
15069      return true;
15070    case Intrinsic::aarch64_neon_pmull64:
15071      if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
15072                                     II->getArgOperand(1)))
15073        return false;
15074      Ops.push_back(&II->getArgOperandUse(0));
15075      Ops.push_back(&II->getArgOperandUse(1));
15076      return true;
15077    case Intrinsic::masked_gather:
15078      if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
15079        return false;
15080      Ops.push_back(&II->getArgOperandUse(0));
15081      return true;
15082    case Intrinsic::masked_scatter:
15083      if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
15084        return false;
15085      Ops.push_back(&II->getArgOperandUse(1));
15086      return true;
15087    default:
15088      return false;
15089    }
15090  }
15091
15092  // Sink vscales closer to uses for better isel
15093  switch (I->getOpcode()) {
15094  case Instruction::GetElementPtr:
15095  case Instruction::Add:
15096  case Instruction::Sub:
15097    for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
15098      if (shouldSinkVScale(I->getOperand(Op), Ops)) {
15099        Ops.push_back(&I->getOperandUse(Op));
15100        return true;
15101      }
15102    }
15103    break;
15104  default:
15105    break;
15106  }
15107
15108  if (!I->getType()->isVectorTy())
15109    return false;
15110
15111  switch (I->getOpcode()) {
15112  case Instruction::Sub:
15113  case Instruction::Add: {
15114    if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
15115      return false;
15116
15117    // If the exts' operands extract either the lower or upper elements, we
15118    // can sink them too.
15119    auto Ext1 = cast<Instruction>(I->getOperand(0));
15120    auto Ext2 = cast<Instruction>(I->getOperand(1));
15121    if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
15122      Ops.push_back(&Ext1->getOperandUse(0));
15123      Ops.push_back(&Ext2->getOperandUse(0));
15124    }
15125
15126    Ops.push_back(&I->getOperandUse(0));
15127    Ops.push_back(&I->getOperandUse(1));
15128
15129    return true;
15130  }
15131  case Instruction::Or: {
15132    // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
15133    // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
15134    if (Subtarget->hasNEON()) {
15135      Instruction *OtherAnd, *IA, *IB;
15136      Value *MaskValue;
      // MainAnd refers to the And instruction that has 'Not' as one of its
      // operands.
15138      if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
15139                          m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
15140                                           m_Instruction(IA)))))) {
15141        if (match(OtherAnd,
15142                  m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
15143          Instruction *MainAnd = I->getOperand(0) == OtherAnd
15144                                     ? cast<Instruction>(I->getOperand(1))
15145                                     : cast<Instruction>(I->getOperand(0));
15146
          // Both Ands should be in the same basic block as the Or.
15148          if (I->getParent() != MainAnd->getParent() ||
15149              I->getParent() != OtherAnd->getParent())
15150            return false;
15151
          // Non-mask operands of both Ands should also be in the same block.
15153          if (I->getParent() != IA->getParent() ||
15154              I->getParent() != IB->getParent())
15155            return false;
15156
          Ops.push_back(
              &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
15158          Ops.push_back(&I->getOperandUse(0));
15159          Ops.push_back(&I->getOperandUse(1));
15160
15161          return true;
15162        }
15163      }
15164    }
15165
15166    return false;
15167  }
15168  case Instruction::Mul: {
15169    int NumZExts = 0, NumSExts = 0;
15170    for (auto &Op : I->operands()) {
15171      // Make sure we are not already sinking this operand
15172      if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
15173        continue;
15174
15175      if (match(&Op, m_SExt(m_Value()))) {
15176        NumSExts++;
15177        continue;
15178      } else if (match(&Op, m_ZExt(m_Value()))) {
15179        NumZExts++;
15180        continue;
15181      }
15182
15183      ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
15184
15185      // If the Shuffle is a splat and the operand is a zext/sext, sinking the
15186      // operand and the s/zext can help create indexed s/umull. This is
15187      // especially useful to prevent i64 mul being scalarized.
15188      if (Shuffle && isSplatShuffle(Shuffle) &&
15189          match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
15190        Ops.push_back(&Shuffle->getOperandUse(0));
15191        Ops.push_back(&Op);
15192        if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
15193          NumSExts++;
15194        else
15195          NumZExts++;
15196        continue;
15197      }
15198
15199      if (!Shuffle)
15200        continue;
15201
15202      Value *ShuffleOperand = Shuffle->getOperand(0);
15203      InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
15204      if (!Insert)
15205        continue;
15206
15207      Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
15208      if (!OperandInstr)
15209        continue;
15210
15211      ConstantInt *ElementConstant =
15212          dyn_cast<ConstantInt>(Insert->getOperand(2));
15213      // Check that the insertelement is inserting into element 0
15214      if (!ElementConstant || !ElementConstant->isZero())
15215        continue;
15216
15217      unsigned Opcode = OperandInstr->getOpcode();
15218      if (Opcode == Instruction::SExt)
15219        NumSExts++;
15220      else if (Opcode == Instruction::ZExt)
15221        NumZExts++;
15222      else {
15223        // If we find that the top bits are known 0, then we can sink and allow
15224        // the backend to generate a umull.
15225        unsigned Bitwidth = I->getType()->getScalarSizeInBits();
15226        APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
15227        const DataLayout &DL = I->getFunction()->getParent()->getDataLayout();
15228        if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
15229          continue;
15230        NumZExts++;
15231      }
15232
15233      Ops.push_back(&Shuffle->getOperandUse(0));
15234      Ops.push_back(&Op);
15235    }
15236
    // It is only profitable to sink if we found two extends of the same type.
15238    return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
15239  }
15240  default:
15241    return false;
15242  }
15243  return false;
15244}
15245
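/// Replace a vector zext with a shuffle against a zeroed vector that spreads
/// the original bytes across the wider destination lanes; the shuffle can
/// then be selected as one or more TBL instructions. Returns false if the
/// destination element width is not handled.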
15246static bool createTblShuffleForZExt(ZExtInst *ZExt, FixedVectorType *DstTy,
15247                                    bool IsLittleEndian) {
15248  Value *Op = ZExt->getOperand(0);
15249  auto *SrcTy = cast<FixedVectorType>(Op->getType());
15250  auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
15251  auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
15252  if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64)
15253    return false;
15254
15255  assert(DstWidth % SrcWidth == 0 &&
15256         "TBL lowering is not supported for a ZExt instruction with this "
15257         "source & destination element type.");
15258  unsigned ZExtFactor = DstWidth / SrcWidth;
15259  unsigned NumElts = SrcTy->getNumElements();
15260  IRBuilder<> Builder(ZExt);
15261  SmallVector<int> Mask;
15262  // Create a mask that selects <0,...,Op[i]> for each lane of the destination
15263  // vector to replace the original ZExt. This can later be lowered to a set of
15264  // tbl instructions.
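  // For example, a little-endian zext from <8 x i8> to <8 x i32> uses the
  // mask <0,8,8,8, 1,8,8,8, ..., 7,8,8,8>, where index 8 selects the zero
  // element inserted below.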
15265  for (unsigned i = 0; i < NumElts * ZExtFactor; i++) {
15266    if (IsLittleEndian) {
15267      if (i % ZExtFactor == 0)
15268        Mask.push_back(i / ZExtFactor);
15269      else
15270        Mask.push_back(NumElts);
15271    } else {
15272      if ((i + 1) % ZExtFactor == 0)
15273        Mask.push_back((i - ZExtFactor + 1) / ZExtFactor);
15274      else
15275        Mask.push_back(NumElts);
15276    }
15277  }
15278
15279  auto *FirstEltZero = Builder.CreateInsertElement(
15280      PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
15281  Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
15282  Result = Builder.CreateBitCast(Result, DstTy);
15283  if (DstTy != ZExt->getType())
15284    Result = Builder.CreateZExt(Result, ZExt->getType());
15285  ZExt->replaceAllUsesWith(Result);
15286  ZExt->eraseFromParent();
15287  return true;
15288}
15289
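/// Lower a vector truncate to i8 elements into one or more TBL instructions
/// that pick the relevant byte out of every source lane. The source vector is
/// first split into 128-bit chunks, which become the TBL table registers.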
15290static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
15291  IRBuilder<> Builder(TI);
15292  SmallVector<Value *> Parts;
15293  int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
15294  auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
15295  auto *DstTy = cast<FixedVectorType>(TI->getType());
15296  assert(SrcTy->getElementType()->isIntegerTy() &&
15297         "Non-integer type source vector element is not supported");
15298  assert(DstTy->getElementType()->isIntegerTy(8) &&
15299         "Unsupported destination vector element type");
15300  unsigned SrcElemTySz =
15301      cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
15302  unsigned DstElemTySz =
15303      cast<IntegerType>(DstTy->getElementType())->getBitWidth();
15304  assert((SrcElemTySz % DstElemTySz == 0) &&
15305         "Cannot lower truncate to tbl instructions for a source element size "
15306         "that is not divisible by the destination element size");
15307  unsigned TruncFactor = SrcElemTySz / DstElemTySz;
15308  assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
15309         "Unsupported source vector element type size");
15310  Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
15311
  // Create a mask that selects every n-th byte from the source table of bytes
  // to form the truncated destination vector, where 'n' is the truncate ratio.
  // For example, for a truncate from <Y x i64> to <Y x i8>, select bytes
  // 0, 8, 16, ..., (Y-1)*8 for the little-endian format.
15316  SmallVector<Constant *, 16> MaskConst;
15317  for (int Itr = 0; Itr < 16; Itr++) {
15318    if (Itr < NumElements)
15319      MaskConst.push_back(Builder.getInt8(
15320          IsLittleEndian ? Itr * TruncFactor
15321                         : Itr * TruncFactor + (TruncFactor - 1)));
15322    else
15323      MaskConst.push_back(Builder.getInt8(255));
15324  }
15325
15326  int MaxTblSz = 128 * 4;
15327  int MaxSrcSz = SrcElemTySz * NumElements;
15328  int ElemsPerTbl =
15329      (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
15330  assert(ElemsPerTbl <= 16 &&
15331         "Maximum elements selected using TBL instruction cannot exceed 16!");
15332
15333  int ShuffleCount = 128 / SrcElemTySz;
15334  SmallVector<int> ShuffleLanes;
15335  for (int i = 0; i < ShuffleCount; ++i)
15336    ShuffleLanes.push_back(i);
15337
15338  // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
15339  // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
15340  // call TBL & save the result in a vector of TBL results for combining later.
15341  SmallVector<Value *> Results;
15342  while (ShuffleLanes.back() < NumElements) {
15343    Parts.push_back(Builder.CreateBitCast(
15344        Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
15345
15346    if (Parts.size() == 4) {
15347      auto *F = Intrinsic::getDeclaration(TI->getModule(),
15348                                          Intrinsic::aarch64_neon_tbl4, VecTy);
15349      Parts.push_back(ConstantVector::get(MaskConst));
15350      Results.push_back(Builder.CreateCall(F, Parts));
15351      Parts.clear();
15352    }
15353
15354    for (int i = 0; i < ShuffleCount; ++i)
15355      ShuffleLanes[i] += ShuffleCount;
15356  }
15357
15358  assert((Parts.empty() || Results.empty()) &&
15359         "Lowering trunc for vectors requiring different TBL instructions is "
15360         "not supported!");
15361  // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
15362  // registers
15363  if (!Parts.empty()) {
15364    Intrinsic::ID TblID;
15365    switch (Parts.size()) {
15366    case 1:
15367      TblID = Intrinsic::aarch64_neon_tbl1;
15368      break;
15369    case 2:
15370      TblID = Intrinsic::aarch64_neon_tbl2;
15371      break;
15372    case 3:
15373      TblID = Intrinsic::aarch64_neon_tbl3;
15374      break;
15375    }
15376
15377    auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy);
15378    Parts.push_back(ConstantVector::get(MaskConst));
15379    Results.push_back(Builder.CreateCall(F, Parts));
15380  }
15381
15382  // Extract the destination vector from TBL result(s) after combining them
15383  // where applicable. Currently, at most two TBLs are supported.
15384  assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
15385                                "more than 2 tbl instructions!");
15386  Value *FinalResult = Results[0];
15387  if (Results.size() == 1) {
15388    if (ElemsPerTbl < 16) {
15389      SmallVector<int> FinalMask(ElemsPerTbl);
15390      std::iota(FinalMask.begin(), FinalMask.end(), 0);
15391      FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
15392    }
15393  } else {
15394    SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
15395    if (ElemsPerTbl < 16) {
15396      std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
15397      std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
15398    } else {
15399      std::iota(FinalMask.begin(), FinalMask.end(), 0);
15400    }
15401    FinalResult =
15402        Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
15403  }
15404
15405  TI->replaceAllUsesWith(FinalResult);
15406  TI->eraseFromParent();
15407}
15408
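/// Try to replace a vector zext, uitofp, fptoui or trunc instruction with a
/// TBL-based shuffle sequence when that is likely to be cheaper. Returns true
/// if the instruction was replaced.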
15409bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
15410    Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
15411  // shuffle_vector instructions are serialized when targeting SVE,
15412  // see LowerSPLAT_VECTOR. This peephole is not beneficial.
15413  if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
15414    return false;
15415
15416  // Try to optimize conversions using tbl. This requires materializing constant
15417  // index vectors, which can increase code size and add loads. Skip the
15418  // transform unless the conversion is in a loop block guaranteed to execute
15419  // and we are not optimizing for size.
15420  Function *F = I->getParent()->getParent();
15421  if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
15422      F->hasOptSize())
15423    return false;
15424
15425  auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
15426  auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
15427  if (!SrcTy || !DstTy)
15428    return false;
15429
  // Convert a zext from <Y x i8> to a wider integer element type into a
  // shuffle that can be lowered to tbl instructions, inserting the original
  // i8 elements and zeroing the remaining bytes of each wider lane. This is
  // enabled for cases where it is beneficial.
15433  auto *ZExt = dyn_cast<ZExtInst>(I);
15434  if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
15435    auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
15436    if (DstWidth % 8 != 0)
15437      return false;
15438
15439    auto *TruncDstType =
15440        cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy));
15441    // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
15442    // the remaining ZExt folded into the user, don't use tbl lowering.
15443    auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
15444    if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
15445                             TargetTransformInfo::getCastContextHint(I),
15446                             TTI::TCK_SizeAndLatency, I) == TTI::TCC_Free) {
15447      if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
15448        return false;
15449
15450      DstTy = TruncDstType;
15451    }
15452
15453    return createTblShuffleForZExt(ZExt, DstTy, Subtarget->isLittleEndian());
15454  }
15455
15456  auto *UIToFP = dyn_cast<UIToFPInst>(I);
15457  if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
15458      DstTy->getElementType()->isFloatTy()) {
15459    IRBuilder<> Builder(I);
15460    auto *ZExt = cast<ZExtInst>(
15461        Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy)));
15462    auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
15463    I->replaceAllUsesWith(UI);
15464    I->eraseFromParent();
15465    return createTblShuffleForZExt(ZExt, cast<FixedVectorType>(ZExt->getType()),
15466                                   Subtarget->isLittleEndian());
15467  }
15468
15469  // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
15470  // followed by a truncate lowered to using tbl.4.
15471  auto *FPToUI = dyn_cast<FPToUIInst>(I);
15472  if (FPToUI &&
15473      (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
15474      SrcTy->getElementType()->isFloatTy() &&
15475      DstTy->getElementType()->isIntegerTy(8)) {
15476    IRBuilder<> Builder(I);
15477    auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
15478                                          VectorType::getInteger(SrcTy));
15479    auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
15480    I->replaceAllUsesWith(TruncI);
15481    I->eraseFromParent();
15482    createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
15483    return true;
15484  }
15485
15486  // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
15487  // tbl instruction selecting the lowest (little endian) or highest (big
15488  // endian) 8 bits per lane of the input, which is represented using 1, 2, 3
15489  // or 4 128-bit table registers.
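  // For example (illustrative): 'trunc <16 x i32> %x to <16 x i8>' keeps one
  // byte per 32-bit lane, so it can be selected as a tbl.4 whose table is the
  // four 128-bit registers holding %x and whose index vector picks that byte
  // from each lane.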
15490  auto *TI = dyn_cast<TruncInst>(I);
15491  if (TI && DstTy->getElementType()->isIntegerTy(8) &&
15492      ((SrcTy->getElementType()->isIntegerTy(32) ||
15493        SrcTy->getElementType()->isIntegerTy(64)) &&
15494       (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
15495    createTblForTrunc(TI, Subtarget->isLittleEndian());
15496    return true;
15497  }
15498
15499  return false;
15500}
15501
15502bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
15503                                          Align &RequiredAlignment) const {
15504  if (!LoadedType.isSimple() ||
15505      (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
15506    return false;
15507  // Cyclone supports unaligned accesses.
15508  RequiredAlignment = Align(1);
15509  unsigned NumBits = LoadedType.getSizeInBits();
15510  return NumBits == 32 || NumBits == 64;
15511}
15512
15513/// A helper function for determining the number of interleaved accesses we
15514/// will generate when lowering accesses of the given type.
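/// For example (illustrative): a <16 x i32> access (512 bits) with 128-bit
/// NEON vectors gives (16 * 32 + 127) / 128 = 4 accesses, while scalable
/// types divide by the configured minimum SVE vector size (at least 128 bits)
/// instead.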
15515unsigned AArch64TargetLowering::getNumInterleavedAccesses(
15516    VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
15517  unsigned VecSize = 128;
15518  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
15519  unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
15520  if (UseScalable)
15521    VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
15522  return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
15523}
15524
15525MachineMemOperand::Flags
15526AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
15527  if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
15528      I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
15529    return MOStridedAccess;
15530  return MachineMemOperand::MONone;
15531}
15532
15533bool AArch64TargetLowering::isLegalInterleavedAccessType(
15534    VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
15535  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
15536  auto EC = VecTy->getElementCount();
15537  unsigned MinElts = EC.getKnownMinValue();
15538
15539  UseScalable = false;
15540
15541  if (!VecTy->isScalableTy() && !Subtarget->hasNEON())
15542    return false;
15543
15544  if (VecTy->isScalableTy() && !Subtarget->hasSVEorSME())
15545    return false;
15546
15547  // Ensure that the predicate for this number of elements is available.
15548  if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(MinElts))
15549    return false;
15550
15551  // Ensure the number of vector elements is greater than 1.
15552  if (MinElts < 2)
15553    return false;
15554
15555  // Ensure the element type is legal.
15556  if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
15557    return false;
15558
15559  if (EC.isScalable()) {
15560    UseScalable = true;
15561    return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
15562  }
15563
15564  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
15565  if (!Subtarget->isNeonAvailable() ||
15566      (Subtarget->useSVEForFixedLengthVectors() &&
15567       (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
15568        (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
15569         isPowerOf2_32(MinElts) && VecSize > 128)))) {
15570    UseScalable = true;
15571    return true;
15572  }
15573
15574  // Ensure the total vector size is 64 or a multiple of 128. Types larger than
15575  // 128 will be split into multiple interleaved accesses.
15576  return VecSize == 64 || VecSize % 128 == 0;
15577}
15578
15579static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
15580  if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
15581    return ScalableVectorType::get(VTy->getElementType(), 2);
15582
15583  if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
15584    return ScalableVectorType::get(VTy->getElementType(), 4);
15585
15586  if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
15587    return ScalableVectorType::get(VTy->getElementType(), 8);
15588
15589  if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
15590    return ScalableVectorType::get(VTy->getElementType(), 8);
15591
15592  if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
15593    return ScalableVectorType::get(VTy->getElementType(), 2);
15594
15595  if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
15596    return ScalableVectorType::get(VTy->getElementType(), 4);
15597
15598  if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
15599    return ScalableVectorType::get(VTy->getElementType(), 8);
15600
15601  if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
15602    return ScalableVectorType::get(VTy->getElementType(), 16);
15603
15604  llvm_unreachable("Cannot handle input vector type");
15605}
15606
15607static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
15608                                           bool Scalable, Type *LDVTy,
15609                                           Type *PtrTy) {
15610  assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
15611  static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
15612                                            Intrinsic::aarch64_sve_ld3_sret,
15613                                            Intrinsic::aarch64_sve_ld4_sret};
15614  static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
15615                                             Intrinsic::aarch64_neon_ld3,
15616                                             Intrinsic::aarch64_neon_ld4};
15617  if (Scalable)
15618    return Intrinsic::getDeclaration(M, SVELoads[Factor - 2], {LDVTy});
15619
15620  return Intrinsic::getDeclaration(M, NEONLoads[Factor - 2], {LDVTy, PtrTy});
15621}
15622
15623static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
15624                                            bool Scalable, Type *STVTy,
15625                                            Type *PtrTy) {
15626  assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
15627  static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
15628                                             Intrinsic::aarch64_sve_st3,
15629                                             Intrinsic::aarch64_sve_st4};
15630  static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
15631                                              Intrinsic::aarch64_neon_st3,
15632                                              Intrinsic::aarch64_neon_st4};
15633  if (Scalable)
15634    return Intrinsic::getDeclaration(M, SVEStores[Factor - 2], {STVTy});
15635
15636  return Intrinsic::getDeclaration(M, NEONStores[Factor - 2], {STVTy, PtrTy});
15637}
15638
15639/// Lower an interleaved load into a ldN intrinsic.
15640///
15641/// E.g. Lower an interleaved load (Factor = 2):
15642///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
15643///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
15644///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
15645///
15646///      Into:
15647///        %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
15648///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
15649///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
15650bool AArch64TargetLowering::lowerInterleavedLoad(
15651    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
15652    ArrayRef<unsigned> Indices, unsigned Factor) const {
15653  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
15654         "Invalid interleave factor");
15655  assert(!Shuffles.empty() && "Empty shufflevector input");
15656  assert(Shuffles.size() == Indices.size() &&
15657         "Unmatched number of shufflevectors and indices");
15658
15659  const DataLayout &DL = LI->getModule()->getDataLayout();
15660
15661  VectorType *VTy = Shuffles[0]->getType();
15662
15663  // Skip if we do not have NEON or if the vector type is not legal for
15664  // interleaved accesses. We can "legalize" wide vector types into multiple
15665  // interleaved accesses as long as the vector size is divisible by 128.
15666  bool UseScalable;
15667  if (!Subtarget->hasNEON() ||
15668      !isLegalInterleavedAccessType(VTy, DL, UseScalable))
15669    return false;
15670
15671  unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
15672
15673  auto *FVTy = cast<FixedVectorType>(VTy);
15674
15675  // A pointer vector can not be the return type of the ldN intrinsics. Need to
15676  // load integer vectors first and then convert to pointer vectors.
15677  Type *EltTy = FVTy->getElementType();
15678  if (EltTy->isPointerTy())
15679    FVTy =
15680        FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
15681
15682  // If we're going to generate more than one load, reset the sub-vector type
15683  // to something legal.
15684  FVTy = FixedVectorType::get(FVTy->getElementType(),
15685                              FVTy->getNumElements() / NumLoads);
15686
15687  auto *LDVTy =
15688      UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
15689
15690  IRBuilder<> Builder(LI);
15691
15692  // The base address of the load.
15693  Value *BaseAddr = LI->getPointerOperand();
15694
15695  Type *PtrTy = LI->getPointerOperandType();
15696  Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
15697                                 LDVTy->getElementCount());
15698
15699  Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
15700                                                UseScalable, LDVTy, PtrTy);
15701
15702  // Holds sub-vectors extracted from the load intrinsic return values. The
15703  // sub-vectors are associated with the shufflevector instructions they will
15704  // replace.
15705  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
15706
15707  Value *PTrue = nullptr;
15708  if (UseScalable) {
15709    std::optional<unsigned> PgPattern =
15710        getSVEPredPatternFromNumElements(FVTy->getNumElements());
15711    if (Subtarget->getMinSVEVectorSizeInBits() ==
15712            Subtarget->getMaxSVEVectorSizeInBits() &&
15713        Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
15714      PgPattern = AArch64SVEPredPattern::all;
15715
15716    auto *PTruePat =
15717        ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
15718    PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
15719                                    {PTruePat});
15720  }
15721
15722  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
15723
15724    // If we're generating more than one load, compute the base address of
15725    // subsequent loads as an offset from the previous.
15726    if (LoadCount > 0)
15727      BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
15728                                            FVTy->getNumElements() * Factor);
15729
15730    CallInst *LdN;
15731    if (UseScalable)
15732      LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
15733    else
15734      LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
15735
15736    // Extract and store the sub-vectors returned by the load intrinsic.
15737    for (unsigned i = 0; i < Shuffles.size(); i++) {
15738      ShuffleVectorInst *SVI = Shuffles[i];
15739      unsigned Index = Indices[i];
15740
15741      Value *SubVec = Builder.CreateExtractValue(LdN, Index);
15742
15743      if (UseScalable)
15744        SubVec = Builder.CreateExtractVector(
15745            FVTy, SubVec,
15746            ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
15747
15748      // Convert the integer vector to pointer vector if the element is pointer.
15749      if (EltTy->isPointerTy())
15750        SubVec = Builder.CreateIntToPtr(
15751            SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
15752                                         FVTy->getNumElements()));
15753
15754      SubVecs[SVI].push_back(SubVec);
15755    }
15756  }
15757
15758  // Replace uses of the shufflevector instructions with the sub-vectors
15759  // returned by the load intrinsic. If a shufflevector instruction is
15760  // associated with more than one sub-vector, those sub-vectors will be
15761  // concatenated into a single wide vector.
15762  for (ShuffleVectorInst *SVI : Shuffles) {
15763    auto &SubVec = SubVecs[SVI];
15764    auto *WideVec =
15765        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
15766    SVI->replaceAllUsesWith(WideVec);
15767  }
15768
15769  return true;
15770}
15771
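/// Scan up to a small, fixed number of instructions following \p It, looking
/// for a store whose address is exactly 16 bytes away from \p Ptr once
/// constant in-bounds GEP offsets have been stripped. Illustrative IR that
/// would be reported as such a nearby pair:
///   store <2 x i32> %a, ptr %p
///   %q = getelementptr inbounds i8, ptr %p, i64 16
///   store <2 x i32> %b, ptr %q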
15772template <typename Iter>
15773bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
15774  int MaxLookupDist = 20;
15775  unsigned IdxWidth = DL.getIndexSizeInBits(0);
15776  APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
15777  const Value *PtrA1 =
15778      Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
15779
15780  while (++It != End) {
15781    if (It->isDebugOrPseudoInst())
15782      continue;
15783    if (MaxLookupDist-- == 0)
15784      break;
15785    if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
15786      const Value *PtrB1 =
15787          SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
15788              DL, OffsetB);
15789      if (PtrA1 == PtrB1 &&
15790          (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
15791                  .abs() == 16)
15792        return true;
15793    }
15794  }
15795
15796  return false;
15797}
15798
15799/// Lower an interleaved store into a stN intrinsic.
15800///
15801/// E.g. Lower an interleaved store (Factor = 3):
15802///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
15803///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
15804///        store <12 x i32> %i.vec, <12 x i32>* %ptr
15805///
15806///      Into:
15807///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
15808///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
15809///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
15810///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
15811///
15812/// Note that the new shufflevectors will be removed and we'll only generate one
15813/// st3 instruction in CodeGen.
15814///
15815/// Example for a more general valid mask (Factor 3). Lower:
15816///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
15817///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
15818///        store <12 x i32> %i.vec, <12 x i32>* %ptr
15819///
15820///      Into:
15821///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
15822///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
15823///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
15824///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
15825bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
15826                                                  ShuffleVectorInst *SVI,
15827                                                  unsigned Factor) const {
15828
15829  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
15830         "Invalid interleave factor");
15831
15832  auto *VecTy = cast<FixedVectorType>(SVI->getType());
15833  assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
15834
15835  unsigned LaneLen = VecTy->getNumElements() / Factor;
15836  Type *EltTy = VecTy->getElementType();
15837  auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
15838
15839  const DataLayout &DL = SI->getModule()->getDataLayout();
15840  bool UseScalable;
15841
15842  // Skip if we do not have NEON or if the vector type is not legal for
15843  // interleaved accesses. We can "legalize" wide vector types into multiple
15844  // interleaved accesses as long as the vector size is divisible by 128.
15845  if (!Subtarget->hasNEON() ||
15846      !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
15847    return false;
15848
15849  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
15850
15851  Value *Op0 = SVI->getOperand(0);
15852  Value *Op1 = SVI->getOperand(1);
15853  IRBuilder<> Builder(SI);
15854
15855  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
15856  // vectors to integer vectors.
15857  if (EltTy->isPointerTy()) {
15858    Type *IntTy = DL.getIntPtrType(EltTy);
15859    unsigned NumOpElts =
15860        cast<FixedVectorType>(Op0->getType())->getNumElements();
15861
15862    // Convert to the corresponding integer vector.
15863    auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
15864    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
15865    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
15866
15867    SubVecTy = FixedVectorType::get(IntTy, LaneLen);
15868  }
15869
15870  // If we're going to generate more than one store, reset the lane length
15871  // and sub-vector type to something legal.
15872  LaneLen /= NumStores;
15873  SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
15874
15875  auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
15876                            : SubVecTy;
15877
15878  // The base address of the store.
15879  Value *BaseAddr = SI->getPointerOperand();
15880
15881  auto Mask = SVI->getShuffleMask();
15882
15883  // Bail out if none of the mask indices are in range, i.e. the mask is all
15884  // `poison` and `Mask` is a vector of -1s; continuing in that case would
15885  // lead to an out-of-bounds read later on.
15886  if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
15887    return false;
15888  }
15889  // A 64-bit st2 which does not start at element 0 will involve adding extra
15890  // ext instructions, making the st2 unprofitable. Also, if there is a nearby
15891  // store that points to BaseAddr+16 or BaseAddr-16, it can be better left as
15892  // a zip;ldp pair, which has higher throughput.
15893  if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
15894      (Mask[0] != 0 ||
15895       hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
15896                            DL) ||
15897       hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
15898                            BaseAddr, DL)))
15899    return false;
15900
15901  Type *PtrTy = SI->getPointerOperandType();
15902  Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
15903                                 STVTy->getElementCount());
15904
15905  Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
15906                                                 UseScalable, STVTy, PtrTy);
15907
15908  Value *PTrue = nullptr;
15909  if (UseScalable) {
15910    std::optional<unsigned> PgPattern =
15911        getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
15912    if (Subtarget->getMinSVEVectorSizeInBits() ==
15913            Subtarget->getMaxSVEVectorSizeInBits() &&
15914        Subtarget->getMinSVEVectorSizeInBits() ==
15915            DL.getTypeSizeInBits(SubVecTy))
15916      PgPattern = AArch64SVEPredPattern::all;
15917
15918    auto *PTruePat =
15919        ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
15920    PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
15921                                    {PTruePat});
15922  }
15923
15924  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
15925
15926    SmallVector<Value *, 5> Ops;
15927
15928    // Split the shufflevector operands into sub vectors for the new stN call.
15929    for (unsigned i = 0; i < Factor; i++) {
15930      Value *Shuffle;
15931      unsigned IdxI = StoreCount * LaneLen * Factor + i;
15932      if (Mask[IdxI] >= 0) {
15933        Shuffle = Builder.CreateShuffleVector(
15934            Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
15935      } else {
15936        unsigned StartMask = 0;
15937        for (unsigned j = 1; j < LaneLen; j++) {
15938          unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
15939          if (Mask[IdxJ] >= 0) {
15940            StartMask = Mask[IdxJ] - j;
15941            break;
15942          }
15943        }
15944        // Note: Filling undef gaps with random elements is ok, since
15945        // those elements were being written anyway (with undefs).
15946        // In the case of all undefs we default to using elements from 0.
15947        // Note: StartMask cannot be negative; it is checked in
15948        // isReInterleaveMask.
15949        Shuffle = Builder.CreateShuffleVector(
15950            Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
15951      }
15952
15953      if (UseScalable)
15954        Shuffle = Builder.CreateInsertVector(
15955            STVTy, UndefValue::get(STVTy), Shuffle,
15956            ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
15957
15958      Ops.push_back(Shuffle);
15959    }
15960
15961    if (UseScalable)
15962      Ops.push_back(PTrue);
15963
15964    // If we're generating more than one store, compute the base address of
15965    // subsequent stores as an offset from the previous one.
15966    if (StoreCount > 0)
15967      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
15968                                            BaseAddr, LaneLen * Factor);
15969
15970    Ops.push_back(BaseAddr);
15971    Builder.CreateCall(StNFunc, Ops);
15972  }
15973  return true;
15974}
15975
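/// Illustrative example of the lowering performed below (SVE, factor 2),
/// written roughly as IR:
///   %wide = load <vscale x 8 x i32>, ptr %p
///   %res  = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
///     @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide)
/// becomes a single structured load:
///   %ldN  = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
///     @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> %pg, ptr %p)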
15976bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
15977    IntrinsicInst *DI, LoadInst *LI) const {
15978  // Only deinterleave2 supported at present.
15979  if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
15980    return false;
15981
15982  // Only a factor of 2 supported at present.
15983  const unsigned Factor = 2;
15984
15985  VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
15986  const DataLayout &DL = DI->getModule()->getDataLayout();
15987  bool UseScalable;
15988  if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
15989    return false;
15990
15991  // TODO: Add support for using SVE instructions with fixed types later, using
15992  // the code from lowerInterleavedLoad to obtain the correct container type.
15993  if (UseScalable && !VTy->isScalableTy())
15994    return false;
15995
15996  unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
15997
15998  VectorType *LdTy =
15999      VectorType::get(VTy->getElementType(),
16000                      VTy->getElementCount().divideCoefficientBy(NumLoads));
16001
16002  Type *PtrTy = LI->getPointerOperandType();
16003  Function *LdNFunc = getStructuredLoadFunction(DI->getModule(), Factor,
16004                                                UseScalable, LdTy, PtrTy);
16005
16006  IRBuilder<> Builder(LI);
16007
16008  Value *Pred = nullptr;
16009  if (UseScalable)
16010    Pred =
16011        Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
16012
16013  Value *BaseAddr = LI->getPointerOperand();
16014  Value *Result;
16015  if (NumLoads > 1) {
16016    Value *Left = PoisonValue::get(VTy);
16017    Value *Right = PoisonValue::get(VTy);
16018
16019    for (unsigned I = 0; I < NumLoads; ++I) {
16020      Value *Offset = Builder.getInt64(I * Factor);
16021
16022      Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
16023      Value *LdN = nullptr;
16024      if (UseScalable)
16025        LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
16026      else
16027        LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
16028
16029      Value *Idx =
16030          Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
16031      Left = Builder.CreateInsertVector(
16032          VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
16033      Right = Builder.CreateInsertVector(
16034          VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
16035    }
16036
16037    Result = PoisonValue::get(DI->getType());
16038    Result = Builder.CreateInsertValue(Result, Left, 0);
16039    Result = Builder.CreateInsertValue(Result, Right, 1);
16040  } else {
16041    if (UseScalable)
16042      Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
16043    else
16044      Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16045  }
16046
16047  DI->replaceAllUsesWith(Result);
16048  return true;
16049}
16050
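/// Illustrative example of the lowering performed below (SVE, factor 2),
/// written roughly as IR:
///   %ileave = call <vscale x 8 x i32>
///     @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a,
///                                                   <vscale x 4 x i32> %b)
///   store <vscale x 8 x i32> %ileave, ptr %p
/// becomes a single structured store:
///   call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> %a,
///                                           <vscale x 4 x i32> %b,
///                                           <vscale x 4 x i1> %pg, ptr %p)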
16051bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
16052    IntrinsicInst *II, StoreInst *SI) const {
16053  // Only interleave2 supported at present.
16054  if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
16055    return false;
16056
16057  // Only a factor of 2 supported at present.
16058  const unsigned Factor = 2;
16059
16060  VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
16061  const DataLayout &DL = II->getModule()->getDataLayout();
16062  bool UseScalable;
16063  if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16064    return false;
16065
16066  // TODO: Add support for using SVE instructions with fixed types later, using
16067  // the code from lowerInterleavedStore to obtain the correct container type.
16068  if (UseScalable && !VTy->isScalableTy())
16069    return false;
16070
16071  unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
16072
16073  VectorType *StTy =
16074      VectorType::get(VTy->getElementType(),
16075                      VTy->getElementCount().divideCoefficientBy(NumStores));
16076
16077  Type *PtrTy = SI->getPointerOperandType();
16078  Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16079                                                 UseScalable, StTy, PtrTy);
16080
16081  IRBuilder<> Builder(SI);
16082
16083  Value *BaseAddr = SI->getPointerOperand();
16084  Value *Pred = nullptr;
16085
16086  if (UseScalable)
16087    Pred =
16088        Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
16089
16090  Value *L = II->getOperand(0);
16091  Value *R = II->getOperand(1);
16092
16093  for (unsigned I = 0; I < NumStores; ++I) {
16094    Value *Address = BaseAddr;
16095    if (NumStores > 1) {
16096      Value *Offset = Builder.getInt64(I * Factor);
16097      Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
16098
16099      Value *Idx =
16100          Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
16101      L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
16102      R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
16103    }
16104
16105    if (UseScalable)
16106      Builder.CreateCall(StNFunc, {L, R, Pred, Address});
16107    else
16108      Builder.CreateCall(StNFunc, {L, R, Address});
16109  }
16110
16111  return true;
16112}
16113
16114EVT AArch64TargetLowering::getOptimalMemOpType(
16115    const MemOp &Op, const AttributeList &FuncAttributes) const {
16116  bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16117  bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16118  bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
16119  // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that,
16120  // it would have taken one instruction to materialize the v2i64 zero and one
16121  // store (with a restrictive addressing mode), so just do i64 stores.
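  // For example (illustrative): a 16-byte memset of zero is better done as two
  // "str xzr" stores than as a vector zero materialization plus a "str q"
  // store.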
16122  bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16123  auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16124    if (Op.isAligned(AlignCheck))
16125      return true;
16126    unsigned Fast;
16127    return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
16128                                          MachineMemOperand::MONone, &Fast) &&
16129           Fast;
16130  };
16131
16132  if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16133      AlignmentIsAcceptable(MVT::v16i8, Align(16)))
16134    return MVT::v16i8;
16135  if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16136    return MVT::f128;
16137  if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16138    return MVT::i64;
16139  if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16140    return MVT::i32;
16141  return MVT::Other;
16142}
16143
16144LLT AArch64TargetLowering::getOptimalMemOpLLT(
16145    const MemOp &Op, const AttributeList &FuncAttributes) const {
16146  bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16147  bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16148  bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
16149  // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that,
16150  // it would have taken one instruction to materialize the v2i64 zero and one
16151  // store (with a restrictive addressing mode), so just do i64 stores.
16152  bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16153  auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16154    if (Op.isAligned(AlignCheck))
16155      return true;
16156    unsigned Fast;
16157    return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
16158                                          MachineMemOperand::MONone, &Fast) &&
16159           Fast;
16160  };
16161
16162  if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16163      AlignmentIsAcceptable(MVT::v2i64, Align(16)))
16164    return LLT::fixed_vector(2, 64);
16165  if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16166    return LLT::scalar(128);
16167  if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16168    return LLT::scalar(64);
16169  if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16170    return LLT::scalar(32);
16171  return LLT();
16172}
16173
16174// 12-bit optionally shifted immediates are legal for adds.
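// For example (illustrative): 0xfff (a plain uimm12) and 0xabc000 (a uimm12
// shifted left by 12) are legal, whereas 0x1001 is not, since it has
// significant bits both inside and outside a single 12-bit field.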
16175bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
16176  if (Immed == std::numeric_limits<int64_t>::min()) {
16177    LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
16178                      << ": avoid UB for INT64_MIN\n");
16179    return false;
16180  }
16181  // Same encoding for add/sub, just flip the sign.
16182  Immed = std::abs(Immed);
16183  bool IsLegal = ((Immed >> 12) == 0 ||
16184                  ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
16185  LLVM_DEBUG(dbgs() << "Is " << Immed
16186                    << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
16187  return IsLegal;
16188}
16189
16190// Return false to prevent folding
16191// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
16192// if the folding leads to worse code.
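// For example (illustrative): folding (mul (add x, 1), 0x10001) into
// (add (mul x, 0x10001), 0x10001) is rejected, because the new add operand
// 0x10001 is not a legal add immediate and would need its own MOVZ+MOVK pair,
// while the original "add x, #1" is free to encode.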
16193bool AArch64TargetLowering::isMulAddWithConstProfitable(
16194    SDValue AddNode, SDValue ConstNode) const {
16195  // Let the DAGCombiner decide for vector types and large types.
16196  const EVT VT = AddNode.getValueType();
16197  if (VT.isVector() || VT.getScalarSizeInBits() > 64)
16198    return true;
16199
16200  // It is worse if c1 is a legal add immediate while c1*c2 is not, and
16201  // c1*c2 has to be materialized with at least two instructions.
16202  const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
16203  const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
16204  const int64_t C1 = C1Node->getSExtValue();
16205  const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
16206  if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
16207    return true;
16208  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
16209  // Adapt to the width of a register.
16210  unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
16211  AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
16212  if (Insn.size() > 1)
16213    return false;
16214
16215  // Default to true and let the DAGCombiner decide.
16216  return true;
16217}
16218
16219// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
16220// immediates is the same as for an add or a sub.
16221bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
16222  return isLegalAddImmediate(Immed);
16223}
16224
16225/// isLegalAddressingMode - Return true if the addressing mode represented
16226/// by AM is legal for this target, for a load/store of the specified type.
16227bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
16228                                                  const AddrMode &AMode, Type *Ty,
16229                                                  unsigned AS, Instruction *I) const {
16230  // AArch64 has five basic addressing modes:
16231  //  reg
16232  //  reg + 9-bit signed offset
16233  //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
16234  //  reg1 + reg2
16235  //  reg + SIZE_IN_BYTES * reg
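  // For illustration (assuming an i64 load), these map onto forms such as:
  //   ldr  x0, [x1]               ; reg
  //   ldur x0, [x1, #-17]         ; reg + 9-bit signed offset
  //   ldr  x0, [x1, #4088]        ; reg + 8 * uimm12
  //   ldr  x0, [x1, x2]           ; reg1 + reg2
  //   ldr  x0, [x1, x2, lsl #3]   ; reg + 8 * reg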
16236
16237  // No global is ever allowed as a base.
16238  if (AMode.BaseGV)
16239    return false;
16240
16241  // No reg+reg+imm addressing.
16242  if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
16243    return false;
16244
16245  // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
16246  // `2*ScaledReg` into `BaseReg + ScaledReg`
16247  AddrMode AM = AMode;
16248  if (AM.Scale && !AM.HasBaseReg) {
16249    if (AM.Scale == 1) {
16250      AM.HasBaseReg = true;
16251      AM.Scale = 0;
16252    } else if (AM.Scale == 2) {
16253      AM.HasBaseReg = true;
16254      AM.Scale = 1;
16255    } else {
16256      return false;
16257    }
16258  }
16259
16260  // A base register is required in all addressing modes.
16261  if (!AM.HasBaseReg)
16262    return false;
16263
16264  if (Ty->isScalableTy()) {
16265    if (isa<ScalableVectorType>(Ty)) {
16266      uint64_t VecElemNumBytes =
16267          DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
16268      return AM.HasBaseReg && !AM.BaseOffs &&
16269             (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
16270    }
16271
16272    return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale;
16273  }
16274
16275  // check reg + imm case:
16276  // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
16277  uint64_t NumBytes = 0;
16278  if (Ty->isSized()) {
16279    uint64_t NumBits = DL.getTypeSizeInBits(Ty);
16280    NumBytes = NumBits / 8;
16281    if (!isPowerOf2_64(NumBits))
16282      NumBytes = 0;
16283  }
16284
16285  return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
16286                                                          AM.Scale);
16287}
16288
16289// Check whether the two offsets belong to the same imm24 range and share the
16290// same high 12 bits; if so, the high part can be encoded in an add's offset.
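// For example (illustrative): MinOffset = 0x12345 and MaxOffset = 0x12ff8
// share the high part 0x12000, which is itself a legal add immediate, so
// 0x12000 is returned and the remaining offsets fit in the low 12 bits.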
16291int64_t
16292AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
16293                                                      int64_t MaxOffset) const {
16294  int64_t HighPart = MinOffset & ~0xfffULL;
16295  if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
16296    // Rebase the value to an integer multiple of imm12.
16297    return HighPart;
16298  }
16299
16300  return 0;
16301}
16302
16303bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
16304  // Consider splitting large offset of struct or array.
16305  return true;
16306}
16307
16308bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
16309    const MachineFunction &MF, EVT VT) const {
16310  VT = VT.getScalarType();
16311
16312  if (!VT.isSimple())
16313    return false;
16314
16315  switch (VT.getSimpleVT().SimpleTy) {
16316  case MVT::f16:
16317    return Subtarget->hasFullFP16();
16318  case MVT::f32:
16319  case MVT::f64:
16320    return true;
16321  default:
16322    break;
16323  }
16324
16325  return false;
16326}
16327
16328bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
16329                                                       Type *Ty) const {
16330  switch (Ty->getScalarType()->getTypeID()) {
16331  case Type::FloatTyID:
16332  case Type::DoubleTyID:
16333    return true;
16334  default:
16335    return false;
16336  }
16337}
16338
16339bool AArch64TargetLowering::generateFMAsInMachineCombiner(
16340    EVT VT, CodeGenOptLevel OptLevel) const {
16341  return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
16342         !useSVEForFixedLengthVectorVT(VT);
16343}
16344
16345const MCPhysReg *
16346AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
16347  // LR is a callee-save register, but we must treat it as clobbered by any call
16348  // site. Hence we include LR in the scratch registers, which are in turn added
16349  // as implicit-defs for stackmaps and patchpoints.
16350  static const MCPhysReg ScratchRegs[] = {
16351    AArch64::X16, AArch64::X17, AArch64::LR, 0
16352  };
16353  return ScratchRegs;
16354}
16355
16356ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
16357  static const MCPhysReg RCRegs[] = {AArch64::FPCR};
16358  return RCRegs;
16359}
16360
16361bool
16362AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
16363                                                     CombineLevel Level) const {
16364  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
16365          N->getOpcode() == ISD::SRL) &&
16366         "Expected shift op");
16367
16368  SDValue ShiftLHS = N->getOperand(0);
16369  EVT VT = N->getValueType(0);
16370
16371  // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
16372  // combine it with shift 'N' to let it be lowered to UBFX except:
16373  // ((x >> C) & mask) << C.
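  // For example (illustrative): ((x >> 3) & 0xff) >> 2 is left alone so the
  // inner "(x >> 3) & 0xff" can become a UBFX, whereas ((x >> 3) & 0xff) << 3
  // may still be commuted because the shift amounts match.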
16374  if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
16375      isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
16376    uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
16377    if (isMask_64(TruncMask)) {
16378      SDValue AndLHS = ShiftLHS.getOperand(0);
16379      if (AndLHS.getOpcode() == ISD::SRL) {
16380        if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
16381          if (N->getOpcode() == ISD::SHL)
16382            if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
16383              return SRLC->getZExtValue() == SHLC->getZExtValue();
16384          return false;
16385        }
16386      }
16387    }
16388  }
16389  return true;
16390}
16391
16392bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
16393    const SDNode *N) const {
16394  assert(N->getOpcode() == ISD::XOR &&
16395         (N->getOperand(0).getOpcode() == ISD::SHL ||
16396          N->getOperand(0).getOpcode() == ISD::SRL) &&
16397         "Expected XOR(SHIFT) pattern");
16398
16399  // Only commute if the entire NOT mask is a hidden shifted mask.
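  // For example (illustrative, i32): xor (shl x, 8), 0xffffff00 can be
  // commuted to shl (xor x, 0x00ffffff), 8, since 0xffffff00 covers exactly
  // the bits the shl can produce, while xor (shl x, 8), 0x0000ff00 cannot.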
16400  auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
16401  auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
16402  if (XorC && ShiftC) {
16403    unsigned MaskIdx, MaskLen;
16404    if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
16405      unsigned ShiftAmt = ShiftC->getZExtValue();
16406      unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
16407      if (N->getOperand(0).getOpcode() == ISD::SHL)
16408        return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
16409      return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
16410    }
16411  }
16412
16413  return false;
16414}
16415
16416bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
16417    const SDNode *N, CombineLevel Level) const {
16418  assert(((N->getOpcode() == ISD::SHL &&
16419           N->getOperand(0).getOpcode() == ISD::SRL) ||
16420          (N->getOpcode() == ISD::SRL &&
16421           N->getOperand(0).getOpcode() == ISD::SHL)) &&
16422         "Expected shift-shift mask");
16423  // Don't allow multiuse shift folding with the same shift amount.
16424  if (!N->getOperand(0)->hasOneUse())
16425    return false;
16426
16427  // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
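  // For example (illustrative): srl(shl(x, 8), 4) may be folded to a shift
  // plus mask, but srl(shl(x, 4), 8) is kept so it can be selected as a UBFX.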
16428  EVT VT = N->getValueType(0);
16429  if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
16430    auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
16431    auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
16432    return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
16433  }
16434
16435  return true;
16436}
16437
16438bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
16439    unsigned BinOpcode, EVT VT) const {
16440  return VT.isScalableVector() && isTypeLegal(VT);
16441}
16442
16443bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
16444                                                              Type *Ty) const {
16445  assert(Ty->isIntegerTy());
16446
16447  unsigned BitSize = Ty->getPrimitiveSizeInBits();
16448  if (BitSize == 0)
16449    return false;
16450
16451  int64_t Val = Imm.getSExtValue();
16452  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
16453    return true;
16454
16455  if ((int64_t)Val < 0)
16456    Val = ~Val;
16457  if (BitSize == 32)
16458    Val &= (1LL << 32) - 1;
16459
16460  unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
16461  // MOVZ is free so return true for one or fewer MOVK.
16462  return Shift < 3;
16463}
16464
16465bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
16466                                                    unsigned Index) const {
16467  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
16468    return false;
16469
16470  return (Index == 0 || Index == ResVT.getVectorMinNumElements());
16471}
16472
16473/// Turn vector tests of the signbit in the form of:
16474///   xor (sra X, elt_size(X)-1), -1
16475/// into:
16476///   cmge X, X, #0
16477static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
16478                                         const AArch64Subtarget *Subtarget) {
16479  EVT VT = N->getValueType(0);
16480  if (!Subtarget->hasNEON() || !VT.isVector())
16481    return SDValue();
16482
16483  // There must be a shift right algebraic before the xor, and the xor must be a
16484  // 'not' operation.
16485  SDValue Shift = N->getOperand(0);
16486  SDValue Ones = N->getOperand(1);
16487  if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
16488      !ISD::isBuildVectorAllOnes(Ones.getNode()))
16489    return SDValue();
16490
16491  // The shift should be smearing the sign bit across each vector element.
16492  auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
16493  EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
16494  if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
16495    return SDValue();
16496
16497  return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
16498}
16499
16500// Given a vecreduce_add node, detect the below pattern and convert it to the
16501// node sequence with UABDL, [S|U]ABD and UADDLP.
16502//
16503// i32 vecreduce_add(
16504//  v16i32 abs(
16505//    v16i32 sub(
16506//     v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
16507// =================>
16508// i32 vecreduce_add(
16509//   v4i32 UADDLP(
16510//     v8i16 add(
16511//       v8i16 zext(
16512//         v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
16513//       v8i16 zext(
16514//         v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
16515static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
16516                                                    SelectionDAG &DAG) {
16517  // Assumed i32 vecreduce_add
16518  if (N->getValueType(0) != MVT::i32)
16519    return SDValue();
16520
16521  SDValue VecReduceOp0 = N->getOperand(0);
16522  unsigned Opcode = VecReduceOp0.getOpcode();
16523  // Assumed v16i32 abs
16524  if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
16525    return SDValue();
16526
16527  SDValue ABS = VecReduceOp0;
16528  // Assumed v16i32 sub
16529  if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
16530      ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
16531    return SDValue();
16532
16533  SDValue SUB = ABS->getOperand(0);
16534  unsigned Opcode0 = SUB->getOperand(0).getOpcode();
16535  unsigned Opcode1 = SUB->getOperand(1).getOpcode();
16536  // Assumed v16i32 type
16537  if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
16538      SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
16539    return SDValue();
16540
16541  // Assumed zext or sext
16542  bool IsZExt = false;
16543  if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
16544    IsZExt = true;
16545  } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
16546    IsZExt = false;
16547  } else
16548    return SDValue();
16549
16550  SDValue EXT0 = SUB->getOperand(0);
16551  SDValue EXT1 = SUB->getOperand(1);
16552  // Assumed zext's operand has v16i8 type
16553  if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
16554      EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
16555    return SDValue();
16556
16557  // Pattern is detected. Let's convert it to a sequence of nodes.
16558  SDLoc DL(N);
16559
16560  // First, create the node pattern of UABD/SABD.
16561  SDValue UABDHigh8Op0 =
16562      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
16563                  DAG.getConstant(8, DL, MVT::i64));
16564  SDValue UABDHigh8Op1 =
16565      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
16566                  DAG.getConstant(8, DL, MVT::i64));
16567  SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
16568                                  UABDHigh8Op0, UABDHigh8Op1);
16569  SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
16570
16571  // Second, create the node pattern of UABAL.
16572  SDValue UABDLo8Op0 =
16573      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
16574                  DAG.getConstant(0, DL, MVT::i64));
16575  SDValue UABDLo8Op1 =
16576      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
16577                  DAG.getConstant(0, DL, MVT::i64));
16578  SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
16579                                UABDLo8Op0, UABDLo8Op1);
16580  SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
16581  SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
16582
16583  // Third, create the node of UADDLP.
16584  SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
16585
16586  // Fourth, create the node of VECREDUCE_ADD.
16587  return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
16588}
16589
16590// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
16591//   vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
16592//   vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
16593// If we have vectors larger than v16i8 we extract v16i8 vectors, follow the
16594// same steps above to get DOT instructions, concatenate them and generate
16595// vecreduce.add(concat_vector(DOT, DOT2, ..)).
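// For example (illustrative), with the DotProd feature available:
//   i32 vecreduce.add(zext <16 x i8> %a to <16 x i32>)
// becomes
//   i32 vecreduce.add(v4i32 UDOT(v4i32 zeroes, %a, <16 x i8> splat(1)))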
16596static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
16597                                          const AArch64Subtarget *ST) {
16598  if (!ST->hasDotProd())
16599    return performVecReduceAddCombineWithUADDLP(N, DAG);
16600
16601  SDValue Op0 = N->getOperand(0);
16602  if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
16603      Op0.getValueType().getVectorElementType() != MVT::i32)
16604    return SDValue();
16605
16606  unsigned ExtOpcode = Op0.getOpcode();
16607  SDValue A = Op0;
16608  SDValue B;
16609  if (ExtOpcode == ISD::MUL) {
16610    A = Op0.getOperand(0);
16611    B = Op0.getOperand(1);
16612    if (A.getOpcode() != B.getOpcode() ||
16613        A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
16614      return SDValue();
16615    ExtOpcode = A.getOpcode();
16616  }
16617  if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
16618    return SDValue();
16619
16620  EVT Op0VT = A.getOperand(0).getValueType();
16621  bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
16622  bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
16623  if (!IsValidElementCount || !IsValidSize)
16624    return SDValue();
16625
16626  SDLoc DL(Op0);
16627  // For non-MLA reductions B can be set to 1. For MLA reductions we take the
16628  // operand of B's extend as B.
16629  if (!B)
16630    B = DAG.getConstant(1, DL, Op0VT);
16631  else
16632    B = B.getOperand(0);
16633
16634  bool IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
16635  unsigned NumOfVecReduce;
16636  EVT TargetType;
16637  if (IsMultipleOf16) {
16638    NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
16639    TargetType = MVT::v4i32;
16640  } else {
16641    NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
16642    TargetType = MVT::v2i32;
16643  }
16644  auto DotOpcode =
16645      (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
16646  // Handle the case where we need to generate only one Dot operation.
16647  if (NumOfVecReduce == 1) {
16648    SDValue Zeros = DAG.getConstant(0, DL, TargetType);
16649    SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
16650                              A.getOperand(0), B);
16651    return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
16652  }
16653  // Generate Dot instructions that are multiple of 16.
16654  unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
16655  SmallVector<SDValue, 4> SDotVec16;
16656  unsigned I = 0;
16657  for (; I < VecReduce16Num; I += 1) {
16658    SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
16659    SDValue Op0 =
16660        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
16661                    DAG.getConstant(I * 16, DL, MVT::i64));
16662    SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
16663                              DAG.getConstant(I * 16, DL, MVT::i64));
16664    SDValue Dot =
16665        DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
16666    SDotVec16.push_back(Dot);
16667  }
16668  // Concatenate dot operations.
16669  EVT SDot16EVT =
16670      EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
16671  SDValue ConcatSDot16 =
16672      DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
16673  SDValue VecReduceAdd16 =
16674      DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
16675  unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
16676  if (VecReduce8Num == 0)
16677    return VecReduceAdd16;
16678
16679  // Generate the remainder Dot operation that is multiple of 8.
16680  SmallVector<SDValue, 4> SDotVec8;
16681  SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
16682  SDValue Vec8Op0 =
16683      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
16684                  DAG.getConstant(I * 16, DL, MVT::i64));
16685  SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
16686                                DAG.getConstant(I * 16, DL, MVT::i64));
16687  SDValue Dot =
16688      DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
16689  SDValue VecReduceAdd8 =
16690      DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
16691  return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
16692                     VecReduceAdd8);
16693}
16694
16695// Given an (integer) vecreduce, we know the order of the inputs does not
16696// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
16697// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
16698// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
16699static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
16700  auto DetectAddExtract = [&](SDValue A) {
16701    // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
16702    // UADDLP(x) if found.
16703    assert(A.getOpcode() == ISD::ADD);
16704    EVT VT = A.getValueType();
16705    SDValue Op0 = A.getOperand(0);
16706    SDValue Op1 = A.getOperand(1);
16707    if (Op0.getOpcode() != Op1.getOpcode() ||
16708        (Op0.getOpcode() != ISD::ZERO_EXTEND &&
16709         Op0.getOpcode() != ISD::SIGN_EXTEND))
16710      return SDValue();
16711    SDValue Ext0 = Op0.getOperand(0);
16712    SDValue Ext1 = Op1.getOperand(0);
16713    if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
16714        Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
16715        Ext0.getOperand(0) != Ext1.getOperand(0))
16716      return SDValue();
16717    // Check that the type is twice the add types, and the extract are from
16718    // upper/lower parts of the same source.
16719    if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
16720        VT.getVectorNumElements() * 2)
16721      return SDValue();
16722    if ((Ext0.getConstantOperandVal(1) != 0 ||
16723         Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
16724        (Ext1.getConstantOperandVal(1) != 0 ||
16725         Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
16726      return SDValue();
16727    unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
16728                                                          : AArch64ISD::SADDLP;
16729    return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
16730  };
16731
16732  if (SDValue R = DetectAddExtract(A))
16733    return R;
16734
16735  if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
16736    if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
16737      return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
16738                         A.getOperand(1));
16739  if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
16740    if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
16741      return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
16742                         A.getOperand(0));
16743  return SDValue();
16744}
16745
16746// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
16747// UADDLV(concat), where the concat represents the 64-bit zext sources.
16748static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
16749  // Look for add(zext(64-bit source), zext(64-bit source)), returning
16750  // UADDLV(concat(zext, zext)) if found.
16751  assert(A.getOpcode() == ISD::ADD);
16752  EVT VT = A.getValueType();
16753  if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
16754    return SDValue();
16755  SDValue Op0 = A.getOperand(0);
16756  SDValue Op1 = A.getOperand(1);
16757  if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
16758    return SDValue();
16759  SDValue Ext0 = Op0.getOperand(0);
16760  SDValue Ext1 = Op1.getOperand(0);
16761  EVT ExtVT0 = Ext0.getValueType();
16762  EVT ExtVT1 = Ext1.getValueType();
16763  // Check zext VTs are the same and 64-bit length.
16764  if (ExtVT0 != ExtVT1 ||
16765      VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
16766    return SDValue();
16767  // Get VT for concat of zext sources.
16768  EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
16769  SDValue Concat =
16770      DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
16771
16772  switch (VT.getSimpleVT().SimpleTy) {
16773  case MVT::v2i64:
16774  case MVT::v4i32:
16775    return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
16776  case MVT::v8i16: {
16777    SDValue Uaddlv =
16778        DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
16779    return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
16780  }
16781  default:
16782    llvm_unreachable("Unhandled vector type");
16783  }
16784}
16785
16786static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
16787  SDValue A = N->getOperand(0);
16788  if (A.getOpcode() == ISD::ADD) {
16789    if (SDValue R = performUADDVAddCombine(A, DAG))
16790      return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
16791    else if (SDValue R = performUADDVZextCombine(A, DAG))
16792      return R;
16793  }
16794  return SDValue();
16795}
16796
16797static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
16798                                 TargetLowering::DAGCombinerInfo &DCI,
16799                                 const AArch64Subtarget *Subtarget) {
16800  if (DCI.isBeforeLegalizeOps())
16801    return SDValue();
16802
16803  return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
16804}
16805
16806SDValue
16807AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
16808                                     SelectionDAG &DAG,
16809                                     SmallVectorImpl<SDNode *> &Created) const {
16810  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
16811  if (isIntDivCheap(N->getValueType(0), Attr))
16812    return SDValue(N, 0); // Lower SDIV as SDIV
16813
16814  EVT VT = N->getValueType(0);
16815
16816  // For scalable and fixed types, mark them as cheap so we can handle them
16817  // much later. This allows us to handle larger-than-legal types.
16818  if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
16819    return SDValue(N, 0);
16820
16821  // fold (sdiv X, pow2)
16822  if ((VT != MVT::i32 && VT != MVT::i64) ||
16823      !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
16824    return SDValue();
16825
16826  return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
16827}
16828
16829SDValue
16830AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
16831                                     SelectionDAG &DAG,
16832                                     SmallVectorImpl<SDNode *> &Created) const {
16833  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
16834  if (isIntDivCheap(N->getValueType(0), Attr))
16835    return SDValue(N, 0); // Lower SREM as SREM
16836
16837  EVT VT = N->getValueType(0);
16838
16839  // For scalable and fixed types, mark them as cheap so we can handle them
16840  // much later. This allows us to handle larger-than-legal types.
16841  if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
16842    return SDValue(N, 0);
16843
16844  // fold (srem X, pow2)
16845  if ((VT != MVT::i32 && VT != MVT::i64) ||
16846      !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
16847    return SDValue();
16848
16849  unsigned Lg2 = Divisor.countr_zero();
16850  if (Lg2 == 0)
16851    return SDValue();
16852
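  // Lower srem(X, 2^Lg2) as a conditional negation of the masked low bits:
  // for non-negative X the result is X & (2^Lg2 - 1), and for negative X it
  // is -((-X) & (2^Lg2 - 1)). When Lg2 == 1 a single AND suffices, because
  // X & 1 == (-X) & 1.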
16853  SDLoc DL(N);
16854  SDValue N0 = N->getOperand(0);
16855  SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
16856  SDValue Zero = DAG.getConstant(0, DL, VT);
16857  SDValue CCVal, CSNeg;
16858  if (Lg2 == 1) {
16859    SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
16860    SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
16861    CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
16862
16863    Created.push_back(Cmp.getNode());
16864    Created.push_back(And.getNode());
16865  } else {
16866    SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
16867    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
16868
16869    SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
16870    SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
16871    SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
16872    CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
16873                        Negs.getValue(1));
16874
16875    Created.push_back(Negs.getNode());
16876    Created.push_back(AndPos.getNode());
16877    Created.push_back(AndNeg.getNode());
16878  }
16879
16880  return CSNeg;
16881}
16882
16883static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
16884  switch (getIntrinsicID(S.getNode())) {
16885  default:
16886    break;
16887  case Intrinsic::aarch64_sve_cntb:
16888    return 8;
16889  case Intrinsic::aarch64_sve_cnth:
16890    return 16;
16891  case Intrinsic::aarch64_sve_cntw:
16892    return 32;
16893  case Intrinsic::aarch64_sve_cntd:
16894    return 64;
16895  }
16896  return {};
16897}
16898
16899/// Calculates what the pre-extend type is, based on the extension
16900/// operation node provided by \p Extend.
16901///
16902/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
16903/// pre-extend type is pulled directly from the operand, while other extend
16904/// operations need a bit more inspection to get this information.
16905///
16906/// \param Extend The SDNode from the DAG that represents the extend operation
16907///
16908/// \returns The type representing the \p Extend source type, or \p MVT::Other
16909/// if no valid type can be determined
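///
/// For example, an (and x, 0xff) mask yields MVT::i8, an AssertSext with an
/// i16 type operand yields MVT::i16, and a plain SIGN_EXTEND or ZERO_EXTEND
/// reports the value type of its operand.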
16910static EVT calculatePreExtendType(SDValue Extend) {
16911  switch (Extend.getOpcode()) {
16912  case ISD::SIGN_EXTEND:
16913  case ISD::ZERO_EXTEND:
16914    return Extend.getOperand(0).getValueType();
16915  case ISD::AssertSext:
16916  case ISD::AssertZext:
16917  case ISD::SIGN_EXTEND_INREG: {
16918    VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
16919    if (!TypeNode)
16920      return MVT::Other;
16921    return TypeNode->getVT();
16922  }
16923  case ISD::AND: {
16924    ConstantSDNode *Constant =
16925        dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
16926    if (!Constant)
16927      return MVT::Other;
16928
16929    uint32_t Mask = Constant->getZExtValue();
16930
16931    if (Mask == UCHAR_MAX)
16932      return MVT::i8;
16933    else if (Mask == USHRT_MAX)
16934      return MVT::i16;
16935    else if (Mask == UINT_MAX)
16936      return MVT::i32;
16937
16938    return MVT::Other;
16939  }
16940  default:
16941    return MVT::Other;
16942  }
16943}
16944
16945/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
16946/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
16947/// SExt/ZExt rather than the scalar SExt/ZExt
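///
/// For example (illustrative):
///   v4i32 build_vector(sext(i16 a), sext(i16 b), sext(i16 c), sext(i16 d))
/// can be rewritten as
///   v4i32 sign_extend(v4i16 build_vector(a, b, c, d))
/// keeping the extension as a single vector operation.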
16948static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
16949  EVT VT = BV.getValueType();
16950  if (BV.getOpcode() != ISD::BUILD_VECTOR &&
16951      BV.getOpcode() != ISD::VECTOR_SHUFFLE)
16952    return SDValue();
16953
16954  // Use the first item in the buildvector/shuffle to get the size of the
16955  // extend, and make sure it looks valid.
16956  SDValue Extend = BV->getOperand(0);
16957  unsigned ExtendOpcode = Extend.getOpcode();
16958  bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
16959                ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
16960                ExtendOpcode == ISD::AssertSext;
16961  if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
16962      ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
16963    return SDValue();
16964  // Shuffle inputs are vectors, so limit to SIGN_EXTEND and ZERO_EXTEND to
16965  // ensure calculatePreExtendType will work without issue.
16966  if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
16967      ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
16968    return SDValue();
16969
16970  // Restrict valid pre-extend data type
16971  EVT PreExtendType = calculatePreExtendType(Extend);
16972  if (PreExtendType == MVT::Other ||
16973      PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
16974    return SDValue();
16975
16976  // Make sure all other operands are equally extended
16977  for (SDValue Op : drop_begin(BV->ops())) {
16978    if (Op.isUndef())
16979      continue;
16980    unsigned Opc = Op.getOpcode();
16981    bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
16982                     Opc == ISD::AssertSext;
16983    if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType)
16984      return SDValue();
16985  }
16986
16987  SDValue NBV;
16988  SDLoc DL(BV);
16989  if (BV.getOpcode() == ISD::BUILD_VECTOR) {
16990    EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
16991    EVT PreExtendLegalType =
16992        PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
16993    SmallVector<SDValue, 8> NewOps;
16994    for (SDValue Op : BV->ops())
16995      NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
16996                                    : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
16997                                                           PreExtendLegalType));
16998    NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
16999  } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
17000    EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
17001    NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
17002                               BV.getOperand(1).isUndef()
17003                                   ? DAG.getUNDEF(PreExtendVT)
17004                                   : BV.getOperand(1).getOperand(0),
17005                               cast<ShuffleVectorSDNode>(BV)->getMask());
17006  }
17007  return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV);
17008}
17009
17010/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
17011/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
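///
/// For example, mul(x, v4i32 dup(sext(i16 c))) can become
/// mul(x, sign_extend(v4i16 dup(c))), which can then match the widening
/// smull/umull patterns.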
17012static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
17013  // If the value type isn't a vector, none of the operands are going to be dups
17014  EVT VT = Mul->getValueType(0);
17015  if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17016    return SDValue();
17017
17018  SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
17019  SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
17020
17021  // Neither operand has been changed; don't make any further changes.
17022  if (!Op0 && !Op1)
17023    return SDValue();
17024
17025  SDLoc DL(Mul);
17026  return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
17027                     Op1 ? Op1 : Mul->getOperand(1));
17028}
17029
17030// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
17031// Same for other types with equivalent constants.
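// The AND(SRL(X, 15), 0x10001) isolates the sign bit of each i16 half of an
// i32 lane, and multiplying that bit by 0xffff expands it to an all-ones or
// all-zeros i16, which is exactly what CMLTz produces on the v8i16 view of X.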
17032static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
17033  EVT VT = N->getValueType(0);
17034  if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
17035      VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
17036    return SDValue();
17037  if (N->getOperand(0).getOpcode() != ISD::AND ||
17038      N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
17039    return SDValue();
17040
17041  SDValue And = N->getOperand(0);
17042  SDValue Srl = And.getOperand(0);
17043
17044  APInt V1, V2, V3;
17045  if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
17046      !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
17047      !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
17048    return SDValue();
17049
17050  unsigned HalfSize = VT.getScalarSizeInBits() / 2;
17051  if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
17052      V3 != (HalfSize - 1))
17053    return SDValue();
17054
17055  EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
17056                                EVT::getIntegerVT(*DAG.getContext(), HalfSize),
17057                                VT.getVectorElementCount() * 2);
17058
17059  SDLoc DL(N);
17060  SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
17061  SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
17062  return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
17063}
17064
17065static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
17066                                 TargetLowering::DAGCombinerInfo &DCI,
17067                                 const AArch64Subtarget *Subtarget) {
17068
17069  if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
17070    return Ext;
17071  if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
17072    return Ext;
17073
17074  if (DCI.isBeforeLegalizeOps())
17075    return SDValue();
17076
17077  // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y;
17078  // the MachineCombiner pass will then combine add+mul into madd.
17079  // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
17080  SDLoc DL(N);
17081  EVT VT = N->getValueType(0);
17082  SDValue N0 = N->getOperand(0);
17083  SDValue N1 = N->getOperand(1);
17084  SDValue MulOper;
17085  unsigned AddSubOpc;
17086
17087  auto IsAddSubWith1 = [&](SDValue V) -> bool {
17088    AddSubOpc = V->getOpcode();
17089    if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
17090      SDValue Opnd = V->getOperand(1);
17091      MulOper = V->getOperand(0);
17092      if (AddSubOpc == ISD::SUB)
17093        std::swap(Opnd, MulOper);
17094      if (auto C = dyn_cast<ConstantSDNode>(Opnd))
17095        return C->isOne();
17096    }
17097    return false;
17098  };
17099
17100  if (IsAddSubWith1(N0)) {
17101    SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
17102    return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
17103  }
17104
17105  if (IsAddSubWith1(N1)) {
17106    SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
17107    return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
17108  }
17109
17110  // The below optimizations require a constant RHS.
17111  if (!isa<ConstantSDNode>(N1))
17112    return SDValue();
17113
17114  ConstantSDNode *C = cast<ConstantSDNode>(N1);
17115  const APInt &ConstValue = C->getAPIntValue();
17116
17117  // Allow the scaling to be folded into the `cnt` instruction by preventing
17118  // the scaling from being obscured here, making it easier to pattern match.
17119  if (IsSVECntIntrinsic(N0) ||
17120      (N0->getOpcode() == ISD::TRUNCATE &&
17121       IsSVECntIntrinsic(N0->getOperand(0))))
17122    if (ConstValue.sge(1) && ConstValue.sle(16))
17123      return SDValue();
17124
17125  // Multiplication of a power of two plus/minus one can be done more
17126  // cheaply as shift+add/sub. For now, this is true unilaterally. If
17127  // future CPUs have a cheaper MADD instruction, this may need to be
17128  // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
17129  // 64-bit is 5 cycles, so this is always a win.
17130  // More aggressively, some multiplications N0 * C can be lowered to
17131  // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
17132  // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
17133  // TODO: lower more cases.
17134
17135  // TrailingZeroes is used to test if the mul can be lowered to
17136  // shift+add+shift.
17137  unsigned TrailingZeroes = ConstValue.countr_zero();
17138  if (TrailingZeroes) {
17139    // Conservatively do not lower to shift+add+shift if the mul might be
17140    // folded into smul or umul.
17141    if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
17142                            isZeroExtended(N0, DAG)))
17143      return SDValue();
17144    // Conservatively do not lower to shift+add+shift if the mul might be
17145    // folded into madd or msub.
17146    if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
17147                           N->use_begin()->getOpcode() == ISD::SUB))
17148      return SDValue();
17149  }
17150  // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
17151  // and shift+add+shift.
17152  APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
17153  unsigned ShiftAmt;
17154
17155  auto Shl = [&](SDValue N0, unsigned N1) {
17156    SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
17157    return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
17158  };
17159  auto Add = [&](SDValue N0, SDValue N1) {
17160    return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
17161  };
17162  auto Sub = [&](SDValue N0, SDValue N1) {
17163    return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
17164  };
17165  auto Negate = [&](SDValue N) {
17166    SDValue Zero = DAG.getConstant(0, DL, VT);
17167    return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
17168  };
17169
17170  // Can the constant C be decomposed into (1+2^M1)*(1+2^N1)? E.g.,
17171  // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1)
17172  // because the (2^N - 1) factor can't be executed with a single instruction.
17173  auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
17174    unsigned BitWidth = C.getBitWidth();
17175    for (unsigned i = 1; i < BitWidth / 2; i++) {
17176      APInt Rem;
17177      APInt X(BitWidth, (1 << i) + 1);
17178      APInt::sdivrem(C, X, N, Rem);
17179      APInt NVMinus1 = N - 1;
17180      if (Rem == 0 && NVMinus1.isPowerOf2()) {
17181        M = X;
17182        return true;
17183      }
17184    }
17185    return false;
17186  };
17187
17188  if (ConstValue.isNonNegative()) {
17189    // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
17190    // (mul x, 2^N - 1) => (sub (shl x, N), x)
17191    // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
17192    // (mul x, (2^M + 1) * (2^N + 1))
17193    //     => MV = (add (shl x, M), x); (add (shl MV, N), MV)
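    // For example, C = 20 = (2^2 + 1) * 2^2 becomes shl(add(shl(x, 2), x), 2),
    // and C = 7 = 2^3 - 1 becomes sub(shl(x, 3), x).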
17194    APInt SCVMinus1 = ShiftedConstValue - 1;
17195    APInt SCVPlus1 = ShiftedConstValue + 1;
17196    APInt CVPlus1 = ConstValue + 1;
17197    APInt CVM, CVN;
17198    if (SCVMinus1.isPowerOf2()) {
17199      ShiftAmt = SCVMinus1.logBase2();
17200      return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
17201    } else if (CVPlus1.isPowerOf2()) {
17202      ShiftAmt = CVPlus1.logBase2();
17203      return Sub(Shl(N0, ShiftAmt), N0);
17204    } else if (SCVPlus1.isPowerOf2()) {
17205      ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17206      return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
17207    } else if (Subtarget->hasALULSLFast() &&
17208               isPowPlusPlusConst(ConstValue, CVM, CVN)) {
17209      APInt CVMMinus1 = CVM - 1;
17210      APInt CVNMinus1 = CVN - 1;
17211      unsigned ShiftM1 = CVMMinus1.logBase2();
17212      unsigned ShiftN1 = CVNMinus1.logBase2();
17213      // ALULSLFast implies that shifts of up to 3 places are fast.
17214      if (ShiftM1 <= 3 && ShiftN1 <= 3) {
17215        SDValue MVal = Add(Shl(N0, ShiftM1), N0);
17216        return Add(Shl(MVal, ShiftN1), MVal);
17217      }
17218    }
17219  } else {
17220    // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
17221    // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
17222    // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
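    // For example, C = -7 becomes sub(x, shl(x, 3)), and C = -12 = -3 * 2^2
    // becomes sub(shl(x, 2), shl(x, 4)).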
17223    APInt SCVPlus1 = -ShiftedConstValue + 1;
17224    APInt CVNegPlus1 = -ConstValue + 1;
17225    APInt CVNegMinus1 = -ConstValue - 1;
17226    if (CVNegPlus1.isPowerOf2()) {
17227      ShiftAmt = CVNegPlus1.logBase2();
17228      return Sub(N0, Shl(N0, ShiftAmt));
17229    } else if (CVNegMinus1.isPowerOf2()) {
17230      ShiftAmt = CVNegMinus1.logBase2();
17231      return Negate(Add(Shl(N0, ShiftAmt), N0));
17232    } else if (SCVPlus1.isPowerOf2()) {
17233      ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17234      return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
17235    }
17236  }
17237
17238  return SDValue();
17239}
17240
17241static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
17242                                                         SelectionDAG &DAG) {
17243  // Take advantage of vector comparisons producing 0 or -1 in each lane to
17244  // optimize away operation when it's from a constant.
17245  //
17246  // The general transformation is:
17247  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
17248  //       AND(VECTOR_CMP(x,y), constant2)
17249  //    constant2 = UNARYOP(constant)
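  //
  // For example, with UNARYOP == sint_to_fp on v4i32:
  //    sint_to_fp(and(setcc(x,y), <1, 2, 3, 4>)) -->
  //       bitcast(and(setcc(x,y), bitcast(<1.0, 2.0, 3.0, 4.0>)))
  // since each setcc lane is either all-zeros or all-ones.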
17250
17251  // Early exit if this isn't a vector operation, the operand of the
17252  // unary operation isn't a bitwise AND, or if the sizes of the operations
17253  // aren't the same.
17254  EVT VT = N->getValueType(0);
17255  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
17256      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
17257      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
17258    return SDValue();
17259
17260  // Now check that the other operand of the AND is a constant. We could
17261  // make the transformation for non-constant splats as well, but it's unclear
17262  // that would be a benefit as it would not eliminate any operations, just
17263  // perform one more step in scalar code before moving to the vector unit.
17264  if (BuildVectorSDNode *BV =
17265          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
17266    // Bail out if the vector isn't a constant.
17267    if (!BV->isConstant())
17268      return SDValue();
17269
17270    // Everything checks out. Build up the new and improved node.
17271    SDLoc DL(N);
17272    EVT IntVT = BV->getValueType(0);
17273    // Create a new constant of the appropriate type for the transformed
17274    // DAG.
17275    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
17276    // The AND node needs bitcasts to/from an integer vector type around it.
17277    SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
17278    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
17279                                 N->getOperand(0)->getOperand(0), MaskConst);
17280    SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
17281    return Res;
17282  }
17283
17284  return SDValue();
17285}
17286
17287static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
17288                                     const AArch64Subtarget *Subtarget) {
17289  // First try to optimize away the conversion when it's conditionally from
17290  // a constant. Vectors only.
17291  if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
17292    return Res;
17293
17294  EVT VT = N->getValueType(0);
17295  if (VT != MVT::f32 && VT != MVT::f64)
17296    return SDValue();
17297
17298  // Only optimize when the source and destination types have the same width.
17299  if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
17300    return SDValue();
17301
17302  // If the result of an integer load is only used by an integer-to-float
17303  // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead.
17304  // This eliminates an "integer-to-vector-move" UOP and improves throughput.
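  // For example, (f32 (sint_to_fp (i32 (load addr)))) becomes
  // (f32 (AArch64ISD::SITOF (f32 (load addr)))), keeping the value in the
  // FP/SIMD register file.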
17305  SDValue N0 = N->getOperand(0);
17306  if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
17307      N0.hasOneUse() &&
17308      // Do not change the width of a volatile load.
17309      !cast<LoadSDNode>(N0)->isVolatile()) {
17310    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
17311    SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
17312                               LN0->getPointerInfo(), LN0->getAlign(),
17313                               LN0->getMemOperand()->getFlags());
17314
17315    // Make sure successors of the original load stay after it by updating them
17316    // to use the new Chain.
17317    DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
17318
17319    unsigned Opcode = N->getOpcode() == ISD::SINT_TO_FP ? AArch64ISD::SITOF
17320                                                         : AArch64ISD::UITOF;
17321    return DAG.getNode(Opcode, SDLoc(N), VT, Load);
17322  }
17323
17324  return SDValue();
17325}
17326
17327/// Fold a floating-point multiply by power of two into floating-point to
17328/// fixed-point conversion.
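/// For example, (fp_to_sint (fmul x, 8.0)) on v4f32 can be lowered via the
/// aarch64_neon_vcvtfp2fxs intrinsic with 3 fractional bits, i.e. a single
/// fcvtzs(x, #3).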
17329static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
17330                                     TargetLowering::DAGCombinerInfo &DCI,
17331                                     const AArch64Subtarget *Subtarget) {
17332  if (!Subtarget->isNeonAvailable())
17333    return SDValue();
17334
17335  if (!N->getValueType(0).isSimple())
17336    return SDValue();
17337
17338  SDValue Op = N->getOperand(0);
17339  if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
17340    return SDValue();
17341
17342  if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
17343    return SDValue();
17344
17345  SDValue ConstVec = Op->getOperand(1);
17346  if (!isa<BuildVectorSDNode>(ConstVec))
17347    return SDValue();
17348
17349  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
17350  uint32_t FloatBits = FloatTy.getSizeInBits();
17351  if (FloatBits != 32 && FloatBits != 64 &&
17352      (FloatBits != 16 || !Subtarget->hasFullFP16()))
17353    return SDValue();
17354
17355  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
17356  uint32_t IntBits = IntTy.getSizeInBits();
17357  if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17358    return SDValue();
17359
17360  // Avoid conversions where iN is larger than the float (e.g., float -> i64).
17361  if (IntBits > FloatBits)
17362    return SDValue();
17363
17364  BitVector UndefElements;
17365  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
17366  int32_t Bits = IntBits == 64 ? 64 : 32;
17367  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
17368  if (C == -1 || C == 0 || C > Bits)
17369    return SDValue();
17370
17371  EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
17372  if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
17373    return SDValue();
17374
17375  if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
17376      N->getOpcode() == ISD::FP_TO_UINT_SAT) {
17377    EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
17378    if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
17379      return SDValue();
17380  }
17381
17382  SDLoc DL(N);
17383  bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
17384                   N->getOpcode() == ISD::FP_TO_SINT_SAT);
17385  unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
17386                                      : Intrinsic::aarch64_neon_vcvtfp2fxu;
17387  SDValue FixConv =
17388      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
17389                  DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
17390                  Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
17391  // We can handle smaller integers by generating an extra trunc.
17392  if (IntBits < FloatBits)
17393    FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
17394
17395  return FixConv;
17396}
17397
17398/// Fold a floating-point divide by power of two into fixed-point to
17399/// floating-point conversion.
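/// For example, (fdiv (sint_to_fp x), 16.0) on v4i32 can be lowered via the
/// aarch64_neon_vcvtfxs2fp intrinsic with 4 fractional bits, i.e. a single
/// scvtf(x, #4).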
17400static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
17401                                  TargetLowering::DAGCombinerInfo &DCI,
17402                                  const AArch64Subtarget *Subtarget) {
17403  if (!Subtarget->hasNEON())
17404    return SDValue();
17405
17406  SDValue Op = N->getOperand(0);
17407  unsigned Opc = Op->getOpcode();
17408  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
17409      !Op.getOperand(0).getValueType().isSimple() ||
17410      (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
17411    return SDValue();
17412
17413  SDValue ConstVec = N->getOperand(1);
17414  if (!isa<BuildVectorSDNode>(ConstVec))
17415    return SDValue();
17416
17417  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
17418  int32_t IntBits = IntTy.getSizeInBits();
17419  if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17420    return SDValue();
17421
17422  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
17423  int32_t FloatBits = FloatTy.getSizeInBits();
17424  if (FloatBits != 32 && FloatBits != 64)
17425    return SDValue();
17426
17427  // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
17428  if (IntBits > FloatBits)
17429    return SDValue();
17430
17431  BitVector UndefElements;
17432  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
17433  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
17434  if (C == -1 || C == 0 || C > FloatBits)
17435    return SDValue();
17436
17437  MVT ResTy;
17438  unsigned NumLanes = Op.getValueType().getVectorNumElements();
17439  switch (NumLanes) {
17440  default:
17441    return SDValue();
17442  case 2:
17443    ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
17444    break;
17445  case 4:
17446    ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
17447    break;
17448  }
17449
17450  if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
17451    return SDValue();
17452
17453  SDLoc DL(N);
17454  SDValue ConvInput = Op.getOperand(0);
17455  bool IsSigned = Opc == ISD::SINT_TO_FP;
17456  if (IntBits < FloatBits)
17457    ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
17458                            ResTy, ConvInput);
17459
17460  unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
17461                                      : Intrinsic::aarch64_neon_vcvtfxu2fp;
17462  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17463                     DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17464                     DAG.getConstant(C, DL, MVT::i32));
17465}
17466
17467static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
17468                               const AArch64TargetLowering &TLI) {
17469  EVT VT = N->getValueType(0);
17470  SelectionDAG &DAG = DCI.DAG;
17471  SDLoc DL(N);
17472
17473  if (!VT.isVector())
17474    return SDValue();
17475
17476  // The combining code currently only works for NEON vectors. In particular,
17477  // it does not work for SVE when dealing with vectors wider than 128 bits.
17478  // It also doesn't work in streaming mode, because it would generate BSL
17479  // instructions that are invalid in streaming mode.
17480  if (TLI.useSVEForFixedLengthVectorVT(
17481          VT, !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable()))
17482    return SDValue();
17483
17484  SDValue N0 = N->getOperand(0);
17485  if (N0.getOpcode() != ISD::AND)
17486    return SDValue();
17487
17488  SDValue N1 = N->getOperand(1);
17489  if (N1.getOpcode() != ISD::AND)
17490    return SDValue();
17491
17492  // InstCombine does (not (neg a)) => (add a -1).
17493  // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
17494  // Loop over all combinations of AND operands.
17495  for (int i = 1; i >= 0; --i) {
17496    for (int j = 1; j >= 0; --j) {
17497      SDValue O0 = N0->getOperand(i);
17498      SDValue O1 = N1->getOperand(j);
17499      SDValue Sub, Add, SubSibling, AddSibling;
17500
17501      // Find a SUB and an ADD operand, one from each AND.
17502      if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
17503        Sub = O0;
17504        Add = O1;
17505        SubSibling = N0->getOperand(1 - i);
17506        AddSibling = N1->getOperand(1 - j);
17507      } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
17508        Add = O0;
17509        Sub = O1;
17510        AddSibling = N0->getOperand(1 - i);
17511        SubSibling = N1->getOperand(1 - j);
17512      } else
17513        continue;
17514
17515      if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode()))
17516        continue;
17517
17518      // The all-ones constant is always the right-hand operand of the Add.
17519      if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode()))
17520        continue;
17521
17522      if (Sub.getOperand(1) != Add.getOperand(0))
17523        continue;
17524
17525      return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
17526    }
17527  }
17528
17529  // (or (and a b) (and (not a) c)) => (bsl a b c)
17530  // We only have to look for constant vectors here since the general, variable
17531  // case can be handled in TableGen.
17532  unsigned Bits = VT.getScalarSizeInBits();
17533  uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
17534  for (int i = 1; i >= 0; --i)
17535    for (int j = 1; j >= 0; --j) {
17536      BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
17537      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
17538      if (!BVN0 || !BVN1)
17539        continue;
17540
17541      bool FoundMatch = true;
17542      for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
17543        ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
17544        ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
17545        if (!CN0 || !CN1 ||
17546            CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
17547          FoundMatch = false;
17548          break;
17549        }
17550      }
17551
17552      if (FoundMatch)
17553        return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
17554                           N0->getOperand(1 - i), N1->getOperand(1 - j));
17555    }
17556
17557  return SDValue();
17558}
17559
17560// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
17561// convert to csel(ccmp(.., cc0)), depending on cc1:
17562
17563// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
17564// =>
17565// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
17566//
17567// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
17568// =>
17569// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
17570static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
17571  EVT VT = N->getValueType(0);
17572  SDValue CSel0 = N->getOperand(0);
17573  SDValue CSel1 = N->getOperand(1);
17574
17575  if (CSel0.getOpcode() != AArch64ISD::CSEL ||
17576      CSel1.getOpcode() != AArch64ISD::CSEL)
17577    return SDValue();
17578
17579  if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
17580    return SDValue();
17581
17582  if (!isNullConstant(CSel0.getOperand(0)) ||
17583      !isOneConstant(CSel0.getOperand(1)) ||
17584      !isNullConstant(CSel1.getOperand(0)) ||
17585      !isOneConstant(CSel1.getOperand(1)))
17586    return SDValue();
17587
17588  SDValue Cmp0 = CSel0.getOperand(3);
17589  SDValue Cmp1 = CSel1.getOperand(3);
17590  AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2);
17591  AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2);
17592  if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
17593    return SDValue();
17594  if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
17595      Cmp0.getOpcode() == AArch64ISD::SUBS) {
17596    std::swap(Cmp0, Cmp1);
17597    std::swap(CC0, CC1);
17598  }
17599
17600  if (Cmp1.getOpcode() != AArch64ISD::SUBS)
17601    return SDValue();
17602
17603  SDLoc DL(N);
17604  SDValue CCmp, Condition;
17605  unsigned NZCV;
17606
17607  if (N->getOpcode() == ISD::AND) {
17608    AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0);
17609    Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
17610    NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1);
17611  } else {
17612    AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1);
17613    Condition = DAG.getConstant(CC0, DL, MVT_CC);
17614    NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1);
17615  }
17616
17617  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
17618
17619  auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
17620  if (Op1 && Op1->getAPIntValue().isNegative() &&
17621      Op1->getAPIntValue().sgt(-32)) {
17622    // CCMP accepts a constant in the range [0, 31].
17623    // If Op1 is a constant in the range [-31, -1], we can select CCMN
17624    // instead to avoid the extra mov.
17625    SDValue AbsOp1 =
17626        DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
17627    CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
17628                       NZCVOp, Condition, Cmp0);
17629  } else {
17630    CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
17631                       Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
17632  }
17633  return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
17634                     CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
17635                     CCmp);
17636}
17637
17638static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
17639                                const AArch64Subtarget *Subtarget,
17640                                const AArch64TargetLowering &TLI) {
17641  SelectionDAG &DAG = DCI.DAG;
17642  EVT VT = N->getValueType(0);
17643
17644  if (SDValue R = performANDORCSELCombine(N, DAG))
17645    return R;
17646
17647  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
17648    return SDValue();
17649
17650  if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
17651    return Res;
17652
17653  return SDValue();
17654}
17655
17656static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
17657  if (!MemVT.getVectorElementType().isSimple())
17658    return false;
17659
17660  uint64_t MaskForTy = 0ull;
17661  switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
17662  case MVT::i8:
17663    MaskForTy = 0xffull;
17664    break;
17665  case MVT::i16:
17666    MaskForTy = 0xffffull;
17667    break;
17668  case MVT::i32:
17669    MaskForTy = 0xffffffffull;
17670    break;
17671  default:
17672    return false;
17674  }
17675
17676  if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
17677    if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
17678      return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
17679
17680  return false;
17681}
17682
17683static SDValue performReinterpretCastCombine(SDNode *N) {
17684  SDValue LeafOp = SDValue(N, 0);
17685  SDValue Op = N->getOperand(0);
17686  while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
17687         LeafOp.getValueType() != Op.getValueType())
17688    Op = Op->getOperand(0);
17689  if (LeafOp.getValueType() == Op.getValueType())
17690    return Op;
17691  return SDValue();
17692}
17693
17694static SDValue performSVEAndCombine(SDNode *N,
17695                                    TargetLowering::DAGCombinerInfo &DCI) {
17696  if (DCI.isBeforeLegalizeOps())
17697    return SDValue();
17698
17699  SelectionDAG &DAG = DCI.DAG;
17700  SDValue Src = N->getOperand(0);
17701  unsigned Opc = Src->getOpcode();
17702
17703  // Zero/any extend of an unsigned unpack
17704  if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
17705    SDValue UnpkOp = Src->getOperand(0);
17706    SDValue Dup = N->getOperand(1);
17707
17708    if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
17709      return SDValue();
17710
17711    SDLoc DL(N);
17712    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
17713    if (!C)
17714      return SDValue();
17715
17716    uint64_t ExtVal = C->getZExtValue();
17717
17718    auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
17719      return ((ExtVal == 0xFF && VT == MVT::i8) ||
17720              (ExtVal == 0xFFFF && VT == MVT::i16) ||
17721              (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
17722    };
17723
17724    // If the mask is fully covered by the unpack, we don't need to push
17725    // a new AND onto the operand
17726    EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
17727    if (MaskAndTypeMatch(EltTy))
17728      return Src;
17729
17730    // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
17731    // to see if the mask is all-ones of size MemTy.
17732    auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
17733    if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
17734                         MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
17735      EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
17736      if (MaskAndTypeMatch(EltTy))
17737        return Src;
17738    }
17739
17740    // Truncate to prevent a DUP with an overly wide constant.
17741    APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
17742
17743    // Otherwise, make sure we propagate the AND to the operand
17744    // of the unpack
17745    Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
17746                      DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
17747
17748    SDValue And = DAG.getNode(ISD::AND, DL,
17749                              UnpkOp->getValueType(0), UnpkOp, Dup);
17750
17751    return DAG.getNode(Opc, DL, N->getValueType(0), And);
17752  }
17753
17754  // If either operand of the AND is an all-active predicate, the AND is
17755  // redundant and we can simply return the other operand.
17756  if (isAllActivePredicate(DAG, N->getOperand(0)))
17757    return N->getOperand(1);
17758  if (isAllActivePredicate(DAG, N->getOperand(1)))
17759    return N->getOperand(0);
17760
17761  if (!EnableCombineMGatherIntrinsics)
17762    return SDValue();
17763
17764  SDValue Mask = N->getOperand(1);
17765
17766  if (!Src.hasOneUse())
17767    return SDValue();
17768
17769  EVT MemVT;
17770
17771  // SVE load instructions perform an implicit zero-extend, which makes them
17772  // perfect candidates for combining.
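  // For example, and(GLD1 zero-extending from i8 elements, splat(0xff)) is
  // redundant, because the gather already zero-extended each element from i8.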
17773  switch (Opc) {
17774  case AArch64ISD::LD1_MERGE_ZERO:
17775  case AArch64ISD::LDNF1_MERGE_ZERO:
17776  case AArch64ISD::LDFF1_MERGE_ZERO:
17777    MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
17778    break;
17779  case AArch64ISD::GLD1_MERGE_ZERO:
17780  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
17781  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
17782  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
17783  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
17784  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
17785  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
17786  case AArch64ISD::GLDFF1_MERGE_ZERO:
17787  case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
17788  case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
17789  case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
17790  case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
17791  case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
17792  case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
17793  case AArch64ISD::GLDNT1_MERGE_ZERO:
17794    MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
17795    break;
17796  default:
17797    return SDValue();
17798  }
17799
17800  if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
17801    return Src;
17802
17803  return SDValue();
17804}
17805
17806// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
17807static SDValue performANDSETCCCombine(SDNode *N,
17808                                      TargetLowering::DAGCombinerInfo &DCI) {
17809
17810  // This function performs an optimization on a specific pattern involving
17811  // an AND operation and SETCC (Set Condition Code) node.
17812
17813  SDValue SetCC = N->getOperand(0);
17814  EVT VT = N->getValueType(0);
17815  SelectionDAG &DAG = DCI.DAG;
17816
17817  // If the current node (N) is used by any SELECT instruction, bail out
17818  // without applying the optimization, since doing so could produce
17819  // incorrect results.
17820  for (auto U : N->uses())
17821    if (U->getOpcode() == ISD::SELECT)
17822      return SDValue();
17823
17824  // Check if the operand is a SETCC node with floating-point comparison
17825  if (SetCC.getOpcode() == ISD::SETCC &&
17826      SetCC.getOperand(0).getValueType() == MVT::f32) {
17827
17828    SDValue Cmp;
17829    AArch64CC::CondCode CC;
17830
17831    // Check if the DAG is after legalization and if we can emit the conjunction
17832    if (!DCI.isBeforeLegalize() &&
17833        (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
17834
17835      AArch64CC::CondCode InvertedCC = AArch64CC::getInvertedCondCode(CC);
17836
17837      SDLoc DL(N);
17838      return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
17839                         DAG.getConstant(0, DL, VT),
17840                         DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp);
17841    }
17842  }
17843  return SDValue();
17844}
17845
17846static SDValue performANDCombine(SDNode *N,
17847                                 TargetLowering::DAGCombinerInfo &DCI) {
17848  SelectionDAG &DAG = DCI.DAG;
17849  SDValue LHS = N->getOperand(0);
17850  SDValue RHS = N->getOperand(1);
17851  EVT VT = N->getValueType(0);
17852
17853  if (SDValue R = performANDORCSELCombine(N, DAG))
17854    return R;
17855
17856  if (SDValue R = performANDSETCCCombine(N, DCI))
17857    return R;
17858
17859  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
17860    return SDValue();
17861
17862  if (VT.isScalableVector())
17863    return performSVEAndCombine(N, DCI);
17864
17865  // The combining code below works only for NEON vectors. In particular, it
17866  // does not work for SVE when dealing with vectors wider than 128 bits.
17867  if (!VT.is64BitVector() && !VT.is128BitVector())
17868    return SDValue();
17869
17870  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
17871  if (!BVN)
17872    return SDValue();
17873
17874  // AND does not accept an immediate, so check if we can use a BIC immediate
17875  // instruction instead. We do this here instead of using a (and x, (mvni imm))
17876  // pattern in isel, because some immediates may be lowered to the preferred
17877  // (and x, (movi imm)) form, even though an mvni representation also exists.
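  // For example, a splatted mask of 0xffffff00 has the complement 0x000000ff,
  // which fits the 32-bit modified-immediate encoding, so BIC v, #0xff clears
  // exactly the bits that the AND would have cleared.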
17878  APInt DefBits(VT.getSizeInBits(), 0);
17879  APInt UndefBits(VT.getSizeInBits(), 0);
17880  if (resolveBuildVector(BVN, DefBits, UndefBits)) {
17881    SDValue NewOp;
17882
17883    // Any bits known to already be 0 need not be cleared again, which can help
17884    // reduce the size of the immediate to one supported by the instruction.
17885    KnownBits Known = DAG.computeKnownBits(LHS);
17886    APInt ZeroSplat(VT.getSizeInBits(), 0);
17887    for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
17888      ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
17889                   << (Known.Zero.getBitWidth() * I);
17890
17891    DefBits = ~(DefBits | ZeroSplat);
17892    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
17893                                    DefBits, &LHS)) ||
17894        (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
17895                                    DefBits, &LHS)))
17896      return NewOp;
17897
17898    UndefBits = ~(UndefBits | ZeroSplat);
17899    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
17900                                    UndefBits, &LHS)) ||
17901        (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
17902                                    UndefBits, &LHS)))
17903      return NewOp;
17904  }
17905
17906  return SDValue();
17907}
17908
17909static SDValue performFADDCombine(SDNode *N,
17910                                  TargetLowering::DAGCombinerInfo &DCI) {
17911  SelectionDAG &DAG = DCI.DAG;
17912  SDValue LHS = N->getOperand(0);
17913  SDValue RHS = N->getOperand(1);
17914  EVT VT = N->getValueType(0);
17915  SDLoc DL(N);
17916
17917  if (!N->getFlags().hasAllowReassociation())
17918    return SDValue();
17919
17920  // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
17921  auto ReassocComplex = [&](SDValue A, SDValue B) {
17922    if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
17923      return SDValue();
17924    unsigned Opc = A.getConstantOperandVal(0);
17925    if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
17926        Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
17927        Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
17928        Opc != Intrinsic::aarch64_neon_vcmla_rot270)
17929      return SDValue();
17930    SDValue VCMLA = DAG.getNode(
17931        ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
17932        DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
17933        A.getOperand(2), A.getOperand(3));
17934    VCMLA->setFlags(A->getFlags());
17935    return VCMLA;
17936  };
17937  if (SDValue R = ReassocComplex(LHS, RHS))
17938    return R;
17939  if (SDValue R = ReassocComplex(RHS, LHS))
17940    return R;
17941
17942  return SDValue();
17943}
17944
17945static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
17946  switch (Opcode) {
17947  case ISD::STRICT_FADD:
17948  case ISD::FADD:
17949    return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
17950  case ISD::ADD:
17951    return VT == MVT::i64;
17952  default:
17953    return false;
17954  }
17955}
17956
17957static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
17958                        AArch64CC::CondCode Cond);
17959
17960static bool isPredicateCCSettingOp(SDValue N) {
17961  if ((N.getOpcode() == ISD::SETCC) ||
17962      (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
17963       (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
17964        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
17965        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
17966        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
17967        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
17968        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
17969        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
17970        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
17971        // get_active_lane_mask is lowered to a whilelo instruction.
17972        N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
17973    return true;
17974
17975  return false;
17976}
17977
17978// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
17979// ... into: "ptrue p, all" + PTEST
17980static SDValue
17981performFirstTrueTestVectorCombine(SDNode *N,
17982                                  TargetLowering::DAGCombinerInfo &DCI,
17983                                  const AArch64Subtarget *Subtarget) {
17984  assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
17985  // Make sure PTEST can be legalised with illegal types.
17986  if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
17987    return SDValue();
17988
17989  SDValue N0 = N->getOperand(0);
17990  EVT VT = N0.getValueType();
17991
17992  if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
17993      !isNullConstant(N->getOperand(1)))
17994    return SDValue();
17995
17996  // Restrict the DAG combine to cases where we're extracting from a
17997  // flag-setting operation.
17998  if (!isPredicateCCSettingOp(N0))
17999    return SDValue();
18000
18001  // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
18002  SelectionDAG &DAG = DCI.DAG;
18003  SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
18004  return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
18005}
18006
18007// Materialize : Idx = (add (mul vscale, NumEls), -1)
18008//               i1 = extract_vector_elt t37, Constant:i64<Idx>
18009//     ... into: "ptrue p, all" + PTEST
18010static SDValue
18011performLastTrueTestVectorCombine(SDNode *N,
18012                                 TargetLowering::DAGCombinerInfo &DCI,
18013                                 const AArch64Subtarget *Subtarget) {
18014  assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18015  // Make sure PTEST can be legalised with illegal types.
18016  if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18017    return SDValue();
18018
18019  SDValue N0 = N->getOperand(0);
18020  EVT OpVT = N0.getValueType();
18021
18022  if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
18023    return SDValue();
18024
18025  // Idx == (add (mul vscale, NumEls), -1)
18026  SDValue Idx = N->getOperand(1);
18027  if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
18028    return SDValue();
18029
18030  SDValue VS = Idx.getOperand(0);
18031  if (VS.getOpcode() != ISD::VSCALE)
18032    return SDValue();
18033
18034  unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
18035  if (VS.getConstantOperandVal(0) != NumEls)
18036    return SDValue();
18037
18038  // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
18039  SelectionDAG &DAG = DCI.DAG;
18040  SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
18041  return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
18042}
18043
18044static SDValue
18045performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18046                               const AArch64Subtarget *Subtarget) {
18047  assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18048  if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
18049    return Res;
18050  if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
18051    return Res;
18052
18053  SelectionDAG &DAG = DCI.DAG;
18054  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
18055
18056  EVT VT = N->getValueType(0);
18057  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
18058  bool IsStrict = N0->isStrictFPOpcode();
18059
18060  // extract(dup x) -> x
18061  if (N0.getOpcode() == AArch64ISD::DUP)
18062    return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
18063                          : N0.getOperand(0);
18064
18065  // Rewrite for pairwise fadd pattern
18066  //   (f32 (extract_vector_elt
18067  //           (fadd (vXf32 Other)
18068  //                 (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
18069  // ->
18070  //   (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
18071  //              (extract_vector_elt (vXf32 Other) 1))
18072  // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
18073  // we can only do this when it's used only by the extract_vector_elt.
18074  if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
18075      (!IsStrict || N0.hasOneUse())) {
18076    SDLoc DL(N0);
18077    SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
18078    SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
18079
18080    ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
18081    SDValue Other = N00;
18082
18083    // And handle the commutative case.
18084    if (!Shuffle) {
18085      Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
18086      Other = N01;
18087    }
18088
18089    if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
18090        Other == Shuffle->getOperand(0)) {
18091      SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18092                                     DAG.getConstant(0, DL, MVT::i64));
18093      SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18094                                     DAG.getConstant(1, DL, MVT::i64));
18095      if (!IsStrict)
18096        return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
18097
18098      // For strict_fadd we need uses of the final extract_vector to be replaced
18099      // with the strict_fadd, but we also need uses of the chain output of the
18100      // original strict_fadd to use the chain output of the new strict_fadd as
18101      // otherwise it may not be deleted.
18102      SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
18103                                {VT, MVT::Other},
18104                                {N0->getOperand(0), Extract1, Extract2});
18105      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
18106      DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
18107      return SDValue(N, 0);
18108    }
18109  }
18110
18111  return SDValue();
18112}
18113
18114static SDValue performConcatVectorsCombine(SDNode *N,
18115                                           TargetLowering::DAGCombinerInfo &DCI,
18116                                           SelectionDAG &DAG) {
18117  SDLoc dl(N);
18118  EVT VT = N->getValueType(0);
18119  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
18120  unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
18121
18122  if (VT.isScalableVector())
18123    return SDValue();
18124
18125  // Optimize concat_vectors of truncated vectors, where the intermediate
18126  // type is illegal, to avoid said illegality, e.g.,
18127  //   (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
18128  //                          (v2i16 (truncate (v2i64)))))
18129  // ->
18130  //   (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
18131  //                                    (v4i32 (bitcast (v2i64))),
18132  //                                    <0, 2, 4, 6>)))
18133  // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
18134  // on both input and result type, so we might generate worse code.
18135  // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
18136  if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18137      N1Opc == ISD::TRUNCATE) {
18138    SDValue N00 = N0->getOperand(0);
18139    SDValue N10 = N1->getOperand(0);
18140    EVT N00VT = N00.getValueType();
18141
18142    if (N00VT == N10.getValueType() &&
18143        (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
18144        N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
18145      MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
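      // Build the even-lane shuffle mask <0, 2, 4, ...> that is applied to the
      // two bitcast inputs before the final truncate down to VT.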
18146      SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
18147      for (size_t i = 0; i < Mask.size(); ++i)
18148        Mask[i] = i * 2;
18149      return DAG.getNode(ISD::TRUNCATE, dl, VT,
18150                         DAG.getVectorShuffle(
18151                             MidVT, dl,
18152                             DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
18153                             DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
18154    }
18155  }
18156
18157  if (N->getOperand(0).getValueType() == MVT::v4i8) {
18158    // If we have a concat of v4i8 loads, convert them to a buildvector of f32
18159    // loads to prevent having to go through the v4i8 load legalization that
18160    // needs to extend each element into a larger type.
18161    if (N->getNumOperands() % 2 == 0 && all_of(N->op_values(), [](SDValue V) {
18162          if (V.getValueType() != MVT::v4i8)
18163            return false;
18164          if (V.isUndef())
18165            return true;
18166          LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
18167          return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
18168                 LD->getExtensionType() == ISD::NON_EXTLOAD;
18169        })) {
18170      EVT NVT =
18171          EVT::getVectorVT(*DAG.getContext(), MVT::f32, N->getNumOperands());
18172      SmallVector<SDValue> Ops;
18173
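      // Re-create each v4i8 load as an f32 load of the same memory location,
      // and redirect uses of the old load's chain to the new load's chain so
      // the memory ordering is preserved.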
18174      for (unsigned i = 0; i < N->getNumOperands(); i++) {
18175        SDValue V = N->getOperand(i);
18176        if (V.isUndef())
18177          Ops.push_back(DAG.getUNDEF(MVT::f32));
18178        else {
18179          LoadSDNode *LD = cast<LoadSDNode>(V);
18180          SDValue NewLoad =
18181              DAG.getLoad(MVT::f32, dl, LD->getChain(), LD->getBasePtr(),
18182                          LD->getMemOperand());
18183          DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
18184          Ops.push_back(NewLoad);
18185        }
18186      }
18187      return DAG.getBitcast(N->getValueType(0),
18188                            DAG.getBuildVector(NVT, dl, Ops));
18189    }
18190  }
18191
18192  // Canonicalise concat_vectors to replace concatenations of truncated nots
18193  // with nots of concatenated truncates. This in some cases allows for multiple
18194  // redundant negations to be eliminated.
18195  //  (concat_vectors (v4i16 (truncate (not (v4i32)))),
18196  //                  (v4i16 (truncate (not (v4i32)))))
18197  // ->
18198  //  (not (concat_vectors (v4i16 (truncate (v4i32))),
18199  //                       (v4i16 (truncate (v4i32)))))
18200  if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18201      N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
18202      N->isOnlyUserOf(N1.getNode())) {
18203    auto isBitwiseVectorNegate = [](SDValue V) {
18204      return V->getOpcode() == ISD::XOR &&
18205             ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
18206    };
18207    SDValue N00 = N0->getOperand(0);
18208    SDValue N10 = N1->getOperand(0);
18209    if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
18210        isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
18211      return DAG.getNOT(
18212          dl,
18213          DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18214                      DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
18215                                  N00->getOperand(0)),
18216                      DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
18217                                  N10->getOperand(0))),
18218          VT);
18219    }
18220  }
18221
18222  // Wait till after everything is legalized to try this. That way we have
18223  // legal vector types and such.
18224  if (DCI.isBeforeLegalizeOps())
18225    return SDValue();
18226
18227  // Optimise concat_vectors of two [us]avgceils or [us]avgfloors that use
18228  // extracted subvectors from the same original vectors. Combine these into a
18229  // single avg that operates on the two original vectors.
18230  // avgceil is the target independent name for rhadd, avgfloor is a hadd.
18231  // Example:
18232  //  (concat_vectors (v8i8 (avgceils (extract_subvector (v16i8 OpA, <0>),
18233  //                                   extract_subvector (v16i8 OpB, <0>))),
18234  //                  (v8i8 (avgceils (extract_subvector (v16i8 OpA, <8>),
18235  //                                   extract_subvector (v16i8 OpB, <8>)))))
18236  // ->
18237  //  (v16i8(avgceils(v16i8 OpA, v16i8 OpB)))
18238  if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
18239      (N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS ||
18240       N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS)) {
18241    SDValue N00 = N0->getOperand(0);
18242    SDValue N01 = N0->getOperand(1);
18243    SDValue N10 = N1->getOperand(0);
18244    SDValue N11 = N1->getOperand(1);
18245
18246    EVT N00VT = N00.getValueType();
18247    EVT N10VT = N10.getValueType();
18248
18249    if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
18250        N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
18251        N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
18252        N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
18253      SDValue N00Source = N00->getOperand(0);
18254      SDValue N01Source = N01->getOperand(0);
18255      SDValue N10Source = N10->getOperand(0);
18256      SDValue N11Source = N11->getOperand(0);
18257
18258      if (N00Source == N10Source && N01Source == N11Source &&
18259          N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
18260        assert(N0.getValueType() == N1.getValueType());
18261
18262        uint64_t N00Index = N00.getConstantOperandVal(1);
18263        uint64_t N01Index = N01.getConstantOperandVal(1);
18264        uint64_t N10Index = N10.getConstantOperandVal(1);
18265        uint64_t N11Index = N11.getConstantOperandVal(1);
18266
18267        if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
18268            N10Index == N00VT.getVectorNumElements())
18269          return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
18270      }
18271    }
18272  }
18273
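  // Matches a vector logical shift right whose operand is an add of a splatted
  // constant equal to 1 << (ShiftAmt - 1), i.e. the round-then-shift expansion
  // that a rounding shift right narrow (rshrn) is formed from.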
18274  auto IsRSHRN = [](SDValue Shr) {
18275    if (Shr.getOpcode() != AArch64ISD::VLSHR)
18276      return false;
18277    SDValue Op = Shr.getOperand(0);
18278    EVT VT = Op.getValueType();
18279    unsigned ShtAmt = Shr.getConstantOperandVal(1);
18280    if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
18281      return false;
18282
18283    APInt Imm;
18284    if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
18285      Imm = APInt(VT.getScalarSizeInBits(),
18286                  Op.getOperand(1).getConstantOperandVal(0)
18287                      << Op.getOperand(1).getConstantOperandVal(1));
18288    else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
18289             isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
18290      Imm = APInt(VT.getScalarSizeInBits(),
18291                  Op.getOperand(1).getConstantOperandVal(0));
18292    else
18293      return false;
18294
18295    if (Imm != 1ULL << (ShtAmt - 1))
18296      return false;
18297    return true;
18298  };
18299
18300  // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
18301  if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
18302      ((IsRSHRN(N1) &&
18303        N0.getConstantOperandVal(1) == N1.getConstantOperandVal(1)) ||
18304       N1.isUndef())) {
18305    SDValue X = N0.getOperand(0).getOperand(0);
18306    SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
18307                             : N1.getOperand(0).getOperand(0);
18308    EVT BVT =
18309        X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
18310    SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, dl, BVT, X, Y);
18311    SDValue Add = DAG.getNode(
18312        ISD::ADD, dl, BVT, CC,
18313        DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), dl, BVT));
18314    SDValue Shr =
18315        DAG.getNode(AArch64ISD::VLSHR, dl, BVT, Add, N0.getOperand(1));
18316    return Shr;
18317  }
18318
18319  // concat(zip1(a, b), zip2(a, b)) is zip1 of a and b widened to VT.
18320  if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
18321      N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
18322      N0.getOperand(1) == N1.getOperand(1)) {
18323    SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(0),
18324                             DAG.getUNDEF(N0.getValueType()));
18325    SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(1),
18326                             DAG.getUNDEF(N0.getValueType()));
18327    return DAG.getNode(AArch64ISD::ZIP1, dl, VT, E0, E1);
18328  }
18329
18330  // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
18331  // splat. The indexed instructions are going to be expecting a DUPLANE64, so
18332  // canonicalise to that.
18333  if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
18334    assert(VT.getScalarSizeInBits() == 64);
18335    return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
18336                       DAG.getConstant(0, dl, MVT::i64));
18337  }
18338
18339  // Canonicalise concat_vectors so that the right-hand vector has as few
18340  // bit-casts as possible before its real operation. The primary matching
18341  // destination for these operations will be the narrowing "2" instructions,
18342  // which depend on the operation being performed on this right-hand vector.
18343  // For example,
18344  //    (concat_vectors LHS,  (v1i64 (bitconvert (v4i16 RHS))))
18345  // becomes
18346  //    (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
18347
18348  if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
18349    return SDValue();
18350  SDValue RHS = N1->getOperand(0);
18351  MVT RHSTy = RHS.getValueType().getSimpleVT();
18352  // If the RHS is not a vector, this is not the pattern we're looking for.
18353  if (!RHSTy.isVector())
18354    return SDValue();
18355
18356  LLVM_DEBUG(
18357      dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
18358
18359  MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
18360                                  RHSTy.getVectorNumElements() * 2);
18361  return DAG.getNode(ISD::BITCAST, dl, VT,
18362                     DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
18363                                 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
18364                                 RHS));
18365}
18366
18367static SDValue
18368performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18369                               SelectionDAG &DAG) {
18370  if (DCI.isBeforeLegalizeOps())
18371    return SDValue();
18372
18373  EVT VT = N->getValueType(0);
18374  if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
18375    return SDValue();
18376
18377  SDValue V = N->getOperand(0);
18378
18379  // NOTE: This combine exists in DAGCombiner, but that version's legality check
18380  // blocks this combine because the non-const case requires custom lowering.
18381  //
18382  // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
18383  if (V.getOpcode() == ISD::SPLAT_VECTOR)
18384    if (isa<ConstantSDNode>(V.getOperand(0)))
18385      return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
18386
18387  return SDValue();
18388}
18389
18390static SDValue
18391performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18392                              SelectionDAG &DAG) {
18393  SDLoc DL(N);
18394  SDValue Vec = N->getOperand(0);
18395  SDValue SubVec = N->getOperand(1);
18396  uint64_t IdxVal = N->getConstantOperandVal(2);
18397  EVT VecVT = Vec.getValueType();
18398  EVT SubVT = SubVec.getValueType();
18399
18400  // Only do this for legal fixed vector types.
18401  if (!VecVT.isFixedLengthVector() ||
18402      !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
18403      !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
18404    return SDValue();
18405
18406  // Ignore widening patterns.
18407  if (IdxVal == 0 && Vec.isUndef())
18408    return SDValue();
18409
18410  // Subvector must be half the width and an "aligned" insertion.
18411  unsigned NumSubElts = SubVT.getVectorNumElements();
18412  if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
18413      (IdxVal != 0 && IdxVal != NumSubElts))
18414    return SDValue();
18415
18416  // Fold insert_subvector -> concat_vectors
18417  // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
18418  // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
18419  SDValue Lo, Hi;
18420  if (IdxVal == 0) {
18421    Lo = SubVec;
18422    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
18423                     DAG.getVectorIdxConstant(NumSubElts, DL));
18424  } else {
18425    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
18426                     DAG.getVectorIdxConstant(0, DL));
18427    Hi = SubVec;
18428  }
18429  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
18430}
18431
18432static SDValue tryCombineFixedPointConvert(SDNode *N,
18433                                           TargetLowering::DAGCombinerInfo &DCI,
18434                                           SelectionDAG &DAG) {
18435  // Wait until after everything is legalized to try this. That way we have
18436  // legal vector types and such.
18437  if (DCI.isBeforeLegalizeOps())
18438    return SDValue();
18439  // Transform a scalar conversion of a value from a lane extract into a
18440  // lane extract of a vector conversion. E.g., from foo1 to foo2:
18441  // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
18442  // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
18443  //
18444  // The second form interacts better with instruction selection and the
18445  // register allocator to avoid cross-class register copies that aren't
18446  // coalescable due to a lane reference.
18447
18448  // Check the operand and see if it originates from a lane extract.
18449  SDValue Op1 = N->getOperand(1);
18450  if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
18451    return SDValue();
18452
18453  // Yep, no additional predication needed. Perform the transform.
18454  SDValue IID = N->getOperand(0);
18455  SDValue Shift = N->getOperand(2);
18456  SDValue Vec = Op1.getOperand(0);
18457  SDValue Lane = Op1.getOperand(1);
18458  EVT ResTy = N->getValueType(0);
18459  EVT VecResTy;
18460  SDLoc DL(N);
18461
18462  // The vector width should be 128 bits by the time we get here, even
18463  // if it started as 64 bits (the extract_vector handling will have
18464  // done so). Bail if it is not.
18465  if (Vec.getValueSizeInBits() != 128)
18466    return SDValue();
18467
18468  if (Vec.getValueType() == MVT::v4i32)
18469    VecResTy = MVT::v4f32;
18470  else if (Vec.getValueType() == MVT::v2i64)
18471    VecResTy = MVT::v2f64;
18472  else
18473    return SDValue();
18474
18475  SDValue Convert =
18476      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
18477  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
18478}
18479
18480// AArch64 high-vector "long" operations are formed by performing the non-high
18481// version on an extract_subvector of each operand which gets the high half:
18482//
18483//  (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
18484//
18485// However, there are cases which don't have an extract_high explicitly, but
18486// have another operation that can be made compatible with one for free. For
18487// example:
18488//
18489//  (dupv64 scalar) --> (extract_high (dup128 scalar))
18490//
18491// This routine does the actual conversion of such DUPs, once outer routines
18492// have determined that everything else is in order.
18493// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
18494// similarly here.
18495static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
18496  MVT VT = N.getSimpleValueType();
18497  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
18498      N.getConstantOperandVal(1) == 0)
18499    N = N.getOperand(0);
18500
18501  switch (N.getOpcode()) {
18502  case AArch64ISD::DUP:
18503  case AArch64ISD::DUPLANE8:
18504  case AArch64ISD::DUPLANE16:
18505  case AArch64ISD::DUPLANE32:
18506  case AArch64ISD::DUPLANE64:
18507  case AArch64ISD::MOVI:
18508  case AArch64ISD::MOVIshift:
18509  case AArch64ISD::MOVIedit:
18510  case AArch64ISD::MOVImsl:
18511  case AArch64ISD::MVNIshift:
18512  case AArch64ISD::MVNImsl:
18513    break;
18514  default:
18515    // FMOV could be supported, but isn't very useful, as it would only occur
18516    // if you passed a bitcasted floating-point immediate to an eligible long
18517    // integer op (addl, smull, ...).
18518    return SDValue();
18519  }
18520
18521  if (!VT.is64BitVector())
18522    return SDValue();
18523
18524  SDLoc DL(N);
18525  unsigned NumElems = VT.getVectorNumElements();
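  // If the DUP/MOVI still only produces a 64-bit vector, rebuild it at twice
  // the width so that the high half can be extracted below.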
18526  if (N.getValueType().is64BitVector()) {
18527    MVT ElementTy = VT.getVectorElementType();
18528    MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
18529    N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
18530  }
18531
18532  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
18533                     DAG.getConstant(NumElems, DL, MVT::i64));
18534}
18535
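// Returns true if N, looking through a bitcast, is an extract_subvector taking
// the high half of a fixed-length vector.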
18536static bool isEssentiallyExtractHighSubvector(SDValue N) {
18537  if (N.getOpcode() == ISD::BITCAST)
18538    N = N.getOperand(0);
18539  if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
18540    return false;
18541  if (N.getOperand(0).getValueType().isScalableVector())
18542    return false;
18543  return N.getConstantOperandAPInt(1) ==
18544         N.getOperand(0).getValueType().getVectorNumElements() / 2;
18545}
18546
18547/// Helper structure to keep track of ISD::SET_CC operands.
18548struct GenericSetCCInfo {
18549  const SDValue *Opnd0;
18550  const SDValue *Opnd1;
18551  ISD::CondCode CC;
18552};
18553
18554/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
18555struct AArch64SetCCInfo {
18556  const SDValue *Cmp;
18557  AArch64CC::CondCode CC;
18558};
18559
18560/// Helper structure to keep track of SetCC information.
18561union SetCCInfo {
18562  GenericSetCCInfo Generic;
18563  AArch64SetCCInfo AArch64;
18564};
18565
18566/// Helper structure to be able to read SetCC information. If the IsAArch64
18567/// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
18568/// GenericSetCCInfo.
18569struct SetCCInfoAndKind {
18570  SetCCInfo Info;
18571  bool IsAArch64;
18572};
18573
18574/// Check whether or not \p Op is a SET_CC operation, either a generic one or
18575/// an AArch64 lowered one.
18577/// \p SetCCInfo is filled accordingly.
18578/// \post SetCCInfo is meaningful only when this function returns true.
18579/// \return True when Op is a kind of SET_CC operation.
18580static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
18581  // If this is a setcc, this is straightforward.
18582  if (Op.getOpcode() == ISD::SETCC) {
18583    SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
18584    SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
18585    SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
18586    SetCCInfo.IsAArch64 = false;
18587    return true;
18588  }
18589  // Otherwise, check if this is a matching csel instruction.
18590  // In other words:
18591  // - csel 1, 0, cc
18592  // - csel 0, 1, !cc
18593  if (Op.getOpcode() != AArch64ISD::CSEL)
18594    return false;
18595  // Set the information about the operands.
18596  // TODO: we want the operands of the Cmp not the csel
18597  SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
18598  SetCCInfo.IsAArch64 = true;
18599  SetCCInfo.Info.AArch64.CC =
18600      static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
18601
18602  // Check that the operands match the constraints:
18603  // (1) Both operands must be constants.
18604  // (2) One must be 1 and the other must be 0.
18605  ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
18606  ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
18607
18608  // Check (1).
18609  if (!TValue || !FValue)
18610    return false;
18611
18612  // Check (2).
18613  if (!TValue->isOne()) {
18614    // Update the comparison when we are interested in !cc.
18615    std::swap(TValue, FValue);
18616    SetCCInfo.Info.AArch64.CC =
18617        AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
18618  }
18619  return TValue->isOne() && FValue->isZero();
18620}
18621
18622// Returns true if Op is setcc or zext of setcc.
18623static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
18624  if (isSetCC(Op, Info))
18625    return true;
18626  return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
18627    isSetCC(Op->getOperand(0), Info));
18628}
18629
18630// The folding we want to perform is:
18631// (add x, [zext] (setcc cc ...) )
18632//   -->
18633// (csel x, (add x, 1), !cc ...)
18634//
18635// The latter will get matched to a CSINC instruction.
18636static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
18637  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
18638  SDValue LHS = Op->getOperand(0);
18639  SDValue RHS = Op->getOperand(1);
18640  SetCCInfoAndKind InfoAndKind;
18641
18642  // If both operands are a SET_CC, then we don't want to perform this
18643  // folding and create another csel as this results in more instructions
18644  // (and higher register usage).
18645  if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
18646      isSetCCOrZExtSetCC(RHS, InfoAndKind))
18647    return SDValue();
18648
18649  // If neither operand is a SET_CC, give up.
18650  if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
18651    std::swap(LHS, RHS);
18652    if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
18653      return SDValue();
18654  }
18655
18656  // FIXME: This could be generalized to work for FP comparisons.
18657  EVT CmpVT = InfoAndKind.IsAArch64
18658                  ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
18659                  : InfoAndKind.Info.Generic.Opnd0->getValueType();
18660  if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
18661    return SDValue();
18662
18663  SDValue CCVal;
18664  SDValue Cmp;
18665  SDLoc dl(Op);
18666  if (InfoAndKind.IsAArch64) {
18667    CCVal = DAG.getConstant(
18668        AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
18669        MVT::i32);
18670    Cmp = *InfoAndKind.Info.AArch64.Cmp;
18671  } else
18672    Cmp = getAArch64Cmp(
18673        *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
18674        ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
18675        dl);
18676
18677  EVT VT = Op->getValueType(0);
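  // Materialise (add x, 1) and select between x and x + 1 under the inverted
  // condition; this is the form that gets matched as CSINC.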
18678  LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
18679  return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
18680}
18681
18682// ADD(UADDV a, UADDV b) -->  UADDV(ADD a, b)
18683static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
18684  EVT VT = N->getValueType(0);
18685  // Only scalar integer and vector types.
18686  if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
18687    return SDValue();
18688
18689  SDValue LHS = N->getOperand(0);
18690  SDValue RHS = N->getOperand(1);
18691  if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
18692      RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
18693    return SDValue();
18694
18695  auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
18696  auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
18697  if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
18698    return SDValue();
18699
18700  SDValue Op1 = LHS->getOperand(0);
18701  SDValue Op2 = RHS->getOperand(0);
18702  EVT OpVT1 = Op1.getValueType();
18703  EVT OpVT2 = Op2.getValueType();
18704  if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
18705      Op2.getOpcode() != AArch64ISD::UADDV ||
18706      OpVT1.getVectorElementType() != VT)
18707    return SDValue();
18708
18709  SDValue Val1 = Op1.getOperand(0);
18710  SDValue Val2 = Op2.getOperand(0);
18711  EVT ValVT = Val1->getValueType(0);
18712  SDLoc DL(N);
18713  SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
18714  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
18715                     DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
18716                     DAG.getConstant(0, DL, MVT::i64));
18717}
18718
18719/// Perform the scalar expression combine in the form of:
18720///   CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
18721///   CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
18722static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
18723  EVT VT = N->getValueType(0);
18724  if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
18725    return SDValue();
18726
18727  SDValue LHS = N->getOperand(0);
18728  SDValue RHS = N->getOperand(1);
18729
18730  // Handle commutativity.
18731  if (LHS.getOpcode() != AArch64ISD::CSEL &&
18732      LHS.getOpcode() != AArch64ISD::CSNEG) {
18733    std::swap(LHS, RHS);
18734    if (LHS.getOpcode() != AArch64ISD::CSEL &&
18735        LHS.getOpcode() != AArch64ISD::CSNEG) {
18736      return SDValue();
18737    }
18738  }
18739
18740  if (!LHS.hasOneUse())
18741    return SDValue();
18742
18743  AArch64CC::CondCode AArch64CC =
18744      static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
18745
18746  // The CSEL should include a constant one operand, and the CSNEG should
18747  // include a one or negative-one operand.
18748  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
18749  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
18750  if (!CTVal || !CFVal)
18751    return SDValue();
18752
18753  if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
18754        (CTVal->isOne() || CFVal->isOne())) &&
18755      !(LHS.getOpcode() == AArch64ISD::CSNEG &&
18756        (CTVal->isOne() || CFVal->isAllOnes())))
18757    return SDValue();
18758
18759  // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
18760  if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
18761      !CFVal->isOne()) {
18762    std::swap(CTVal, CFVal);
18763    AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
18764  }
18765
18766  SDLoc DL(N);
18767  // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
18768  if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
18769      !CFVal->isAllOnes()) {
18770    APInt C = -1 * CFVal->getAPIntValue();
18771    CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
18772    CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
18773    AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
18774  }
18775
18776  // It might be neutral for larger constants, as the immediate needs to be
18777  // materialized in a register.
18778  APInt ADDC = CTVal->getAPIntValue();
18779  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18780  if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
18781    return SDValue();
18782
18783  assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
18784          (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
18785         "Unexpected constant value");
18786
18787  SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
18788  SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
18789  SDValue Cmp = LHS.getOperand(3);
18790
18791  return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
18792}
18793
18794// ADD(UDOT(zero, x, y), A) -->  UDOT(A, x, y)
18795static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
18796  EVT VT = N->getValueType(0);
18797  if (N->getOpcode() != ISD::ADD)
18798    return SDValue();
18799
18800  SDValue Dot = N->getOperand(0);
18801  SDValue A = N->getOperand(1);
18802  // Handle commutativity
18803  auto isZeroDot = [](SDValue Dot) {
18804    return (Dot.getOpcode() == AArch64ISD::UDOT ||
18805            Dot.getOpcode() == AArch64ISD::SDOT) &&
18806           isZerosVector(Dot.getOperand(0).getNode());
18807  };
18808  if (!isZeroDot(Dot))
18809    std::swap(Dot, A);
18810  if (!isZeroDot(Dot))
18811    return SDValue();
18812
18813  return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
18814                     Dot.getOperand(2));
18815}
18816
18817static bool isNegatedInteger(SDValue Op) {
18818  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
18819}
18820
18821static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
18822  SDLoc DL(Op);
18823  EVT VT = Op.getValueType();
18824  SDValue Zero = DAG.getConstant(0, DL, VT);
18825  return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
18826}
18827
18828// Try to fold
18829//
18830// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
18831//
18832// The folding helps csel to be matched with csneg without generating a
18833// redundant neg instruction, which includes the negation of the csel
18834// expansion of the abs node lowered by lowerABS.
18835static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
18836  if (!isNegatedInteger(SDValue(N, 0)))
18837    return SDValue();
18838
18839  SDValue CSel = N->getOperand(1);
18840  if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
18841    return SDValue();
18842
18843  SDValue N0 = CSel.getOperand(0);
18844  SDValue N1 = CSel.getOperand(1);
18845
18846  // If neither of them is a negation, the fold is not worthwhile, as it would
18847  // introduce two additional negations while removing only one.
18848  if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
18849    return SDValue();
18850
18851  SDValue N0N = getNegatedInteger(N0, DAG);
18852  SDValue N1N = getNegatedInteger(N1, DAG);
18853
18854  SDLoc DL(N);
18855  EVT VT = CSel.getValueType();
18856  return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
18857                     CSel.getOperand(3));
18858}
18859
18860// The basic add/sub long vector instructions have variants with "2" on the end
18861// which act on the high-half of their inputs. They are normally matched by
18862// patterns like:
18863//
18864// (add (zeroext (extract_high LHS)),
18865//      (zeroext (extract_high RHS)))
18866// -> uaddl2 vD, vN, vM
18867//
18868// However, if one of the extracts is something like a duplicate, this
18869// instruction can still be used profitably. This function puts the DAG into a
18870// more appropriate form for those patterns to trigger.
18871static SDValue performAddSubLongCombine(SDNode *N,
18872                                        TargetLowering::DAGCombinerInfo &DCI) {
18873  SelectionDAG &DAG = DCI.DAG;
18874  if (DCI.isBeforeLegalizeOps())
18875    return SDValue();
18876
18877  MVT VT = N->getSimpleValueType(0);
18878  if (!VT.is128BitVector()) {
18879    if (N->getOpcode() == ISD::ADD)
18880      return performSetccAddFolding(N, DAG);
18881    return SDValue();
18882  }
18883
18884  // Make sure both branches are extended in the same way.
18885  SDValue LHS = N->getOperand(0);
18886  SDValue RHS = N->getOperand(1);
18887  if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
18888       LHS.getOpcode() != ISD::SIGN_EXTEND) ||
18889      LHS.getOpcode() != RHS.getOpcode())
18890    return SDValue();
18891
18892  unsigned ExtType = LHS.getOpcode();
18893
18894  // It's not worth doing if at least one of the inputs isn't already an
18895  // extract, but we don't know which it'll be so we have to try both.
18896  if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
18897    RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
18898    if (!RHS.getNode())
18899      return SDValue();
18900
18901    RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
18902  } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
18903    LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
18904    if (!LHS.getNode())
18905      return SDValue();
18906
18907    LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
18908  }
18909
18910  return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
18911}
18912
18913static bool isCMP(SDValue Op) {
18914  return Op.getOpcode() == AArch64ISD::SUBS &&
18915         !Op.getNode()->hasAnyUseOfValue(0);
18916}
18917
18918// (CSEL 1 0 CC Cond) => CC
18919// (CSEL 0 1 CC Cond) => !CC
18920static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
18921  if (Op.getOpcode() != AArch64ISD::CSEL)
18922    return std::nullopt;
18923  auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
18924  if (CC == AArch64CC::AL || CC == AArch64CC::NV)
18925    return std::nullopt;
18926  SDValue OpLHS = Op.getOperand(0);
18927  SDValue OpRHS = Op.getOperand(1);
18928  if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
18929    return CC;
18930  if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
18931    return getInvertedCondCode(CC);
18932
18933  return std::nullopt;
18934}
18935
18936// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
18937// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
18938static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
18939  SDValue CmpOp = Op->getOperand(2);
18940  if (!isCMP(CmpOp))
18941    return SDValue();
18942
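  // Per the patterns above, the compare has the carry CSET on one side and a
  // constant on the other: 1 on the RHS for ADC, 0 on the LHS for SBC.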
18943  if (IsAdd) {
18944    if (!isOneConstant(CmpOp.getOperand(1)))
18945      return SDValue();
18946  } else {
18947    if (!isNullConstant(CmpOp.getOperand(0)))
18948      return SDValue();
18949  }
18950
18951  SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
18952  auto CC = getCSETCondCode(CsetOp);
18953  if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
18954    return SDValue();
18955
18956  return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
18957                     Op->getOperand(0), Op->getOperand(1),
18958                     CsetOp.getOperand(3));
18959}
18960
18961// (ADC x 0 cond) => (CINC x HS cond)
18962static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
18963  SDValue LHS = N->getOperand(0);
18964  SDValue RHS = N->getOperand(1);
18965  SDValue Cond = N->getOperand(2);
18966
18967  if (!isNullConstant(RHS))
18968    return SDValue();
18969
18970  EVT VT = N->getValueType(0);
18971  SDLoc DL(N);
18972
18973  // (CINC x cc cond) <=> (CSINC x x !cc cond)
18974  SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
18975  return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
18976}
18977
18978// Transform vector add(zext i8 to i32, zext i8 to i32)
18979//  into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
18980// This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
18981// extends.
18982static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) {
18983  EVT VT = N->getValueType(0);
18984  if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
18985      (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
18986       N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
18987      (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
18988       N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
18989      N->getOperand(0).getOperand(0).getValueType() !=
18990          N->getOperand(1).getOperand(0).getValueType())
18991    return SDValue();
18992
18993  SDValue N0 = N->getOperand(0).getOperand(0);
18994  SDValue N1 = N->getOperand(1).getOperand(0);
18995  EVT InVT = N0.getValueType();
18996
18997  EVT S1 = InVT.getScalarType();
18998  EVT S2 = VT.getScalarType();
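  // Only act when the extension skips an intermediate width (i8->i32, or
  // i8/i16->i64). The add/sub is then done at half the destination width; its
  // result cannot overflow that half-width type (the inputs come from a type at
  // most half as wide), so a single sign extend reproduces the full-width
  // result whichever extension kinds were used.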
18999  if ((S2 == MVT::i32 && S1 == MVT::i8) ||
19000      (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
19001    SDLoc DL(N);
19002    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19003                                  S2.getHalfSizedIntegerVT(*DAG.getContext()),
19004                                  VT.getVectorElementCount());
19005    SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
19006    SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
19007    SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
19008    return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp);
19009  }
19010  return SDValue();
19011}
19012
19013static SDValue performBuildVectorCombine(SDNode *N,
19014                                         TargetLowering::DAGCombinerInfo &DCI,
19015                                         SelectionDAG &DAG) {
19016  SDLoc DL(N);
19017  EVT VT = N->getValueType(0);
19018
19019  // A build vector of two extracted elements is equivalent to an
19020  // extract subvector where the inner vector is any-extended to the
19021  // extract_vector_elt VT.
19022  //    (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
19023  //                  (extract_elt_iXX_to_i32 vec Idx+1))
19024  // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
19025
19026  // For now, only consider the v2i32 case, which arises as a result of
19027  // legalization.
19028  if (VT != MVT::v2i32)
19029    return SDValue();
19030
19031  SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19032  // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
19033  if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19034      Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19035      // Constant index.
19036      isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19037      isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19038      // Both EXTRACT_VECTOR_ELT from same vector...
19039      Elt0->getOperand(0) == Elt1->getOperand(0) &&
19040      // ... and contiguous. First element's index +1 == second element's index.
19041      Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
19042      // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19043      // ResultType's known minimum vector length.
19044      Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
19045    SDValue VecToExtend = Elt0->getOperand(0);
19046    EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
19047    if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
19048      return SDValue();
19049
19050    SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
19051
19052    SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
19053    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
19054                       SubvectorIdx);
19055  }
19056
19057  return SDValue();
19058}
19059
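// Push a truncate through a DUP: truncate(DUP(x)) is rebuilt as a DUP at the
// narrower 64-bit result type, truncating the scalar operand itself only when
// an i64 value needs to become an i32 lane.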
19060static SDValue performTruncateCombine(SDNode *N,
19061                                      SelectionDAG &DAG) {
19062  EVT VT = N->getValueType(0);
19063  SDValue N0 = N->getOperand(0);
19064  if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
19065      N0.getOpcode() == AArch64ISD::DUP) {
19066    SDValue Op = N0.getOperand(0);
19067    if (VT.getScalarType() == MVT::i32 &&
19068        N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
19069      Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
19070    return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
19071  }
19072
19073  return SDValue();
19074}
19075
19076// Check whether a node is an extend or shift operand.
19077static bool isExtendOrShiftOperand(SDValue N) {
19078  unsigned Opcode = N.getOpcode();
19079  if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
19080    EVT SrcVT;
19081    if (Opcode == ISD::SIGN_EXTEND_INREG)
19082      SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
19083    else
19084      SrcVT = N.getOperand(0).getValueType();
19085
19086    return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
19087  } else if (Opcode == ISD::AND) {
19088    ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
19089    if (!CSD)
19090      return false;
19091    uint64_t AndMask = CSD->getZExtValue();
19092    return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
19093  } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
19094    return isa<ConstantSDNode>(N.getOperand(1));
19095  }
19096
19097  return false;
19098}
19099
19100// (N - Y) + Z --> (Z - Y) + N
19101// when N is an extend or shift operand
19102static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
19103                                         SelectionDAG &DAG) {
19104  auto IsOneUseExtend = [](SDValue N) {
19105    return N.hasOneUse() && isExtendOrShiftOperand(N);
19106  };
19107
19108  // If Z is a constant, DAGCombiner will revert the combination, causing an
19109  // infinite loop, so don't enable the combination in that case. Likewise, if
19110  // Z is a one-use extend/shift, doing the optimization would also end up in
19111  // an infinite combine loop.
19112  if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
19113    return SDValue();
19114
19115  if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
19116    return SDValue();
19117
19118  SDValue Shift = SUB.getOperand(0);
19119  if (!IsOneUseExtend(Shift))
19120    return SDValue();
19121
19122  SDLoc DL(N);
19123  EVT VT = N->getValueType(0);
19124
19125  SDValue Y = SUB.getOperand(1);
19126  SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
19127  return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
19128}
19129
19130static SDValue performAddCombineForShiftedOperands(SDNode *N,
19131                                                   SelectionDAG &DAG) {
19132  // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
19133  // commutative.
19134  if (N->getOpcode() != ISD::ADD)
19135    return SDValue();
19136
19137  // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
19138  // shifted register is only available for i32 and i64.
19139  EVT VT = N->getValueType(0);
19140  if (VT != MVT::i32 && VT != MVT::i64)
19141    return SDValue();
19142
19143  SDLoc DL(N);
19144  SDValue LHS = N->getOperand(0);
19145  SDValue RHS = N->getOperand(1);
19146
19147  if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
19148    return Val;
19149  if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
19150    return Val;
19151
19152  uint64_t LHSImm = 0, RHSImm = 0;
19153  // If both operands are shifted by an immediate and the shift amount is not
19154  // greater than 4 for one operand, swap LHS and RHS to put the operand with
19155  // the smaller shift amount on the RHS.
19156  //
19157  // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
19158  // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
19159  // with LSL (shift > 4). For the rest of processors, this is no-op for
19160  // performance or correctness.
19161  if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
19162      isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
19163      RHSImm > 4 && LHS.hasOneUse())
19164    return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
19165
19166  return SDValue();
19167}
19168
19169// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2)).
19170// This reassociates it back to allow the creation of more mls instructions.
19171static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
19172  if (N->getOpcode() != ISD::SUB)
19173    return SDValue();
19174
19175  SDValue Add = N->getOperand(1);
19176  SDValue X = N->getOperand(0);
19177  if (Add.getOpcode() != ISD::ADD)
19178    return SDValue();
19179
19180  if (!Add.hasOneUse())
19181    return SDValue();
19182  if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(X)))
19183    return SDValue();
19184
19185  SDValue M1 = Add.getOperand(0);
19186  SDValue M2 = Add.getOperand(1);
19187  if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
19188      M1.getOpcode() != AArch64ISD::UMULL)
19189    return SDValue();
19190  if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
19191      M2.getOpcode() != AArch64ISD::UMULL)
19192    return SDValue();
19193
19194  EVT VT = N->getValueType(0);
19195  SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
19196  return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
19197}
19198
19199// Combine into mla/mls.
19200// This works on the patterns of:
19201//   add v1, (mul v2, v3)
19202//   sub v1, (mul v2, v3)
19203// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
19204// It will transform the add/sub to a scalable version, so that we can
19205// make use of SVE's MLA/MLS that will be generated for that pattern.
19206static SDValue
19207performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
19208  SelectionDAG &DAG = DCI.DAG;
19209  // Make sure that the types are legal
19210  if (!DCI.isAfterLegalizeDAG())
19211    return SDValue();
19212  // Before using SVE's features, check first if it's available.
19213  if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
19214    return SDValue();
19215
19216  if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
19217    return SDValue();
19218
19219  if (!N->getValueType(0).isFixedLengthVector())
19220    return SDValue();
19221
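  // Match (add/sub Op0, (extract_subvector (MUL_PRED ...), 0)) and, if found,
  // redo the add/sub on the scalable type so it can fold with the predicated
  // multiply into MLA/MLS, then convert the result back to the fixed type.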
19222  auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
19223    if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
19224      return SDValue();
19225
19226    if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
19227      return SDValue();
19228
19229    SDValue MulValue = Op1->getOperand(0);
19230    if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
19231      return SDValue();
19232
19233    if (!Op1.hasOneUse() || !MulValue.hasOneUse())
19234      return SDValue();
19235
19236    EVT ScalableVT = MulValue.getValueType();
19237    if (!ScalableVT.isScalableVector())
19238      return SDValue();
19239
19240    SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
19241    SDValue NewValue =
19242        DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
19243    return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
19244  };
19245
19246  if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
19247    return res;
19248  else if (N->getOpcode() == ISD::ADD)
19249    return performOpt(N->getOperand(1), N->getOperand(0));
19250
19251  return SDValue();
19252}
19253
19254// Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
19255// help, for example, to produce ssra from sshr+add.
19256static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
19257  EVT VT = N->getValueType(0);
19258  if (VT != MVT::i64)
19259    return SDValue();
19260  SDValue Op0 = N->getOperand(0);
19261  SDValue Op1 = N->getOperand(1);
19262
19263  // At least one of the operands should be an extract, and the other should be
19264  // something that is easy to convert to v1i64 type (in this case a load).
19265  if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19266      Op0.getOpcode() != ISD::LOAD)
19267    return SDValue();
19268  if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19269      Op1.getOpcode() != ISD::LOAD)
19270    return SDValue();
19271
19272  SDLoc DL(N);
19273  if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19274      Op0.getOperand(0).getValueType() == MVT::v1i64) {
19275    Op0 = Op0.getOperand(0);
19276    Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
19277  } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19278             Op1.getOperand(0).getValueType() == MVT::v1i64) {
19279    Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
19280    Op1 = Op1.getOperand(0);
19281  } else
19282    return SDValue();
19283
19284  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
19285                     DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
19286                     DAG.getConstant(0, DL, MVT::i64));
19287}
19288
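// Collect the simple loads feeding B: either a single load, a one-use
// build_vector/concat_vectors of one-use loads, or the particular
// shuffle-of-concats tree (described below) produced when IR shuffles of loads
// are lowered.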
19289static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
19290  SDValue BV = peekThroughOneUseBitcasts(B);
19291  if (!BV->hasOneUse())
19292    return false;
19293  if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
19294    if (!Ld || !Ld->isSimple())
19295      return false;
19296    Loads.push_back(Ld);
19297    return true;
19298  } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
19299             BV.getOpcode() == ISD::CONCAT_VECTORS) {
19300    for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
19301      auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
19302      if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
19303        return false;
19304      Loads.push_back(Ld);
19305    }
19306    return true;
19307  } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
19308    // Try to find a tree of shuffles and concats from how IR shuffles of loads
19309    // are lowered. Note that this only comes up because we do not always visit
19310    // operands before uses. After that is fixed this can be removed and in the
19311    // meantime this is fairly specific to the lowering we expect from IR.
19312    // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
19313    //   t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
19314    //     t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
19315    //       t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
19316    //       t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
19317    //     t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
19318    //       t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
19319    //   t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
19320    //     t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
19321    if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
19322        B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
19323        B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
19324        B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
19325        B.getOperand(1).getNumOperands() != 4)
19326      return false;
19327    auto SV1 = cast<ShuffleVectorSDNode>(B);
19328    auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
19329    int NumElts = B.getValueType().getVectorNumElements();
19330    int NumSubElts = NumElts / 4;
19331    for (int I = 0; I < NumSubElts; I++) {
19332      // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
19333      if (SV1->getMaskElt(I) != I ||
19334          SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
19335          SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
19336          SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
19337        return false;
19338      // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
19339      if (SV2->getMaskElt(I) != I ||
19340          SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
19341          SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
19342        return false;
19343    }
19344    auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
19345    auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
19346    auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
19347    auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
19348    if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
19349        !Ld2->isSimple() || !Ld3->isSimple())
19350      return false;
19351    Loads.push_back(Ld0);
19352    Loads.push_back(Ld1);
19353    Loads.push_back(Ld2);
19354    Loads.push_back(Ld3);
19355    return true;
19356  }
19357  return false;
19358}
19359
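// Returns true if Op0 and Op1 are matching trees of one-use add/sub/extend
// nodes that differ only in their leaf loads, where each leaf load of Op1 is
// located in memory directly after the corresponding leaf load of Op0.
// NumSubLoads is set to the number of leaf loads contributed by each side.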
19360static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
19361                                            SelectionDAG &DAG,
19362                                            unsigned &NumSubLoads) {
19363  if (!Op0.hasOneUse() || !Op1.hasOneUse())
19364    return false;
19365
19366  SmallVector<LoadSDNode *> Loads0, Loads1;
19367  if (isLoadOrMultipleLoads(Op0, Loads0) &&
19368      isLoadOrMultipleLoads(Op1, Loads1)) {
19369    if (NumSubLoads && Loads0.size() != NumSubLoads)
19370      return false;
19371    NumSubLoads = Loads0.size();
19372    return Loads0.size() == Loads1.size() &&
19373           all_of(zip(Loads0, Loads1), [&DAG](auto L) {
19374             unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
19375             return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
19376                    DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
19377                                                       Size / 8, 1);
19378           });
19379  }
19380
19381  if (Op0.getOpcode() != Op1.getOpcode())
19382    return false;
19383
19384  switch (Op0.getOpcode()) {
19385  case ISD::ADD:
19386  case ISD::SUB:
19387    return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
19388                                           DAG, NumSubLoads) &&
19389           areLoadedOffsetButOtherwiseSame(Op0.getOperand(1), Op1.getOperand(1),
19390                                           DAG, NumSubLoads);
19391  case ISD::SIGN_EXTEND:
19392  case ISD::ANY_EXTEND:
19393  case ISD::ZERO_EXTEND:
19394    EVT XVT = Op0.getOperand(0).getValueType();
19395    if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
19396        XVT.getScalarSizeInBits() != 32)
19397      return false;
19398    return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
19399                                           DAG, NumSubLoads);
19400  }
19401  return false;
19402}
19403
19404// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
19405// into a single load of twice the size, from which we extract the bottom and
19406// top parts so that the shl can use a shll2 instruction. The two loads in that
19407// example can also be larger trees of instructions, which are identical except
19408// for the leaves, which are all loads offset from the LHS, including
19409// buildvectors of multiple loads. For example, the RHS tree could be
19410// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4))).
19411// Whilst it can be common for the larger loads to replace LDP instructions
19412// (which doesn't gain anything on its own), the larger loads can help create
19413// more efficient code, and in buildvectors prevent the need for ld1 lane
19414// inserts, which can be slower than normal loads.
19415static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
19416  EVT VT = N->getValueType(0);
19417  if (!VT.isFixedLengthVector() ||
19418      (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
19419       VT.getScalarSizeInBits() != 64))
19420    return SDValue();
19421
19422  SDValue Other = N->getOperand(0);
19423  SDValue Shift = N->getOperand(1);
19424  if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
19425    std::swap(Shift, Other);
19426  APInt ShiftAmt;
19427  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
19428      !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
19429    return SDValue();
19430
19431  if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
19432      !ISD::isExtOpcode(Other.getOpcode()) ||
19433      Shift.getOperand(0).getOperand(0).getValueType() !=
19434          Other.getOperand(0).getValueType() ||
19435      !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
19436    return SDValue();
19437
19438  SDValue Op0 = Other.getOperand(0);
19439  SDValue Op1 = Shift.getOperand(0).getOperand(0);
19440
19441  unsigned NumSubLoads = 0;
19442  if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
19443    return SDValue();
19444
19445  // Attempt to rule out some unprofitable cases using heuristics (some working
19446  // around suboptimal code generation), notably if the extend would not be able
19447  // to use ushll2 instructions as the types are not large enough. Otherwise zips
19448  // will need to be created, which can increase the instruction count.
19449  unsigned NumElts = Op0.getValueType().getVectorNumElements();
19450  unsigned NumSubElts = NumElts / NumSubLoads;
19451  if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
19452      (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
19453       Op0.getValueType().getSizeInBits() < 128 &&
19454       !DAG.getTargetLoweringInfo().isTypeLegal(Op0.getValueType())))
19455    return SDValue();
19456
19457  // Recreate the tree with the new combined loads.
19458  std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
19459      [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
19460        EVT DVT =
19461            Op0.getValueType().getDoubleNumVectorElementsVT(*DAG.getContext());
19462
19463        SmallVector<LoadSDNode *> Loads0, Loads1;
19464        if (isLoadOrMultipleLoads(Op0, Loads0) &&
19465            isLoadOrMultipleLoads(Op1, Loads1)) {
19466          EVT LoadVT = EVT::getVectorVT(
19467              *DAG.getContext(), Op0.getValueType().getScalarType(),
19468              Op0.getValueType().getVectorNumElements() / Loads0.size());
19469          EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
19470
19471          SmallVector<SDValue> NewLoads;
19472          for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
19473            SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
19474                                       L0->getBasePtr(), L0->getPointerInfo(),
19475                                       L0->getOriginalAlign());
19476            DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
19477            DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
19478            NewLoads.push_back(Load);
19479          }
19480          return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
19481        }
19482
19483        SmallVector<SDValue> Ops;
19484        for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
19485          Ops.push_back(GenCombinedTree(O0, O1, DAG));
19486        return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
19487      };
19488  SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
19489
19490  SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
19491  int Hi = NumSubElts, Lo = 0;
19492  for (unsigned i = 0; i < NumSubLoads; i++) {
19493    for (unsigned j = 0; j < NumSubElts; j++) {
19494      LowMask[i * NumSubElts + j] = Lo++;
19495      HighMask[i * NumSubElts + j] = Hi++;
19496    }
19497    Lo += NumSubElts;
19498    Hi += NumSubElts;
19499  }
19500  SDLoc DL(N);
19501  SDValue Ext0, Ext1;
19502  // Extract the top and bottom lanes, then extend the result. If the two extend
19503  // opcodes match, instead extend the result first and then extract the lanes,
19504  // as that produces slightly smaller code.
19505  if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
19506    SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(),
19507                               NewOp, DAG.getConstant(0, DL, MVT::i64));
19508    SDValue SubH =
19509        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
19510                    DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
19511    SDValue Extr0 =
19512        DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
19513    SDValue Extr1 =
19514        DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
19515    Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
19516    Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
19517  } else {
19518    EVT DVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
19519    SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
19520    SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
19521                               DAG.getConstant(0, DL, MVT::i64));
19522    SDValue SubH =
19523        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
19524                    DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
19525    Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
19526    Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
19527  }
19528  SDValue NShift =
19529      DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
19530  return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
19531}
19532
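// Try a series of target-specific add/sub combines, falling back to
// performAddSubLongCombine if none of them apply.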
19533static SDValue performAddSubCombine(SDNode *N,
19534                                    TargetLowering::DAGCombinerInfo &DCI) {
19535  // Try to change sum of two reductions.
19536  if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
19537    return Val;
19538  if (SDValue Val = performAddDotCombine(N, DCI.DAG))
19539    return Val;
19540  if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
19541    return Val;
19542  if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
19543    return Val;
19544  if (SDValue Val = performVectorAddSubExtCombine(N, DCI.DAG))
19545    return Val;
19546  if (SDValue Val = performAddCombineForShiftedOperands(N, DCI.DAG))
19547    return Val;
19548  if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
19549    return Val;
19550  if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
19551    return Val;
19552  if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
19553    return Val;
19554
19555  if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
19556    return Val;
19557
19558  return performAddSubLongCombine(N, DCI);
19559}
19560
19561// Massage DAGs which we can use the high-half "long" operations on into
19562// something isel will recognize better. E.g.
19563//
19564// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
19565//   (aarch64_neon_umull (extract_high (v2i64 vec))
19566//                       (extract_high (v2i64 (dup128 scalar))))
19567//
19568static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
19569                                       TargetLowering::DAGCombinerInfo &DCI,
19570                                       SelectionDAG &DAG) {
19571  if (DCI.isBeforeLegalizeOps())
19572    return SDValue();
19573
19574  SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
19575  SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
19576  assert(LHS.getValueType().is64BitVector() &&
19577         RHS.getValueType().is64BitVector() &&
19578         "unexpected shape for long operation");
19579
19580  // Either node could be a DUP, but it's not worth doing both of them (you'd
19581  // just as well use the non-high version) so look for a corresponding extract
19582  // operation on the other "wing".
19583  if (isEssentiallyExtractHighSubvector(LHS)) {
19584    RHS = tryExtendDUPToExtractHigh(RHS, DAG);
19585    if (!RHS.getNode())
19586      return SDValue();
19587  } else if (isEssentiallyExtractHighSubvector(RHS)) {
19588    LHS = tryExtendDUPToExtractHigh(LHS, DAG);
19589    if (!LHS.getNode())
19590      return SDValue();
19591  } else
19592    return SDValue();
19593
19594  if (IID == Intrinsic::not_intrinsic)
19595    return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
19596
19597  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
19598                     N->getOperand(0), LHS, RHS);
19599}
19600
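// Try to lower a vector shift intrinsic whose shift amount is a constant
// (splat) to the corresponding AArch64ISD immediate-shift node, or fold it
// away entirely when the shift amount is zero.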
19601static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
19602  MVT ElemTy = N->getSimpleValueType(0).getScalarType();
19603  unsigned ElemBits = ElemTy.getSizeInBits();
19604
19605  int64_t ShiftAmount;
19606  if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
19607    APInt SplatValue, SplatUndef;
19608    unsigned SplatBitSize;
19609    bool HasAnyUndefs;
19610    if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
19611                              HasAnyUndefs, ElemBits) ||
19612        SplatBitSize != ElemBits)
19613      return SDValue();
19614
19615    ShiftAmount = SplatValue.getSExtValue();
19616  } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
19617    ShiftAmount = CVN->getSExtValue();
19618  } else
19619    return SDValue();
19620
19621  // If the shift amount is zero, remove the shift intrinsic.
19622  if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
19623    return N->getOperand(1);
19624
19625  unsigned Opcode;
19626  bool IsRightShift;
19627  switch (IID) {
19628  default:
19629    llvm_unreachable("Unknown shift intrinsic");
19630  case Intrinsic::aarch64_neon_sqshl:
19631    Opcode = AArch64ISD::SQSHL_I;
19632    IsRightShift = false;
19633    break;
19634  case Intrinsic::aarch64_neon_uqshl:
19635    Opcode = AArch64ISD::UQSHL_I;
19636    IsRightShift = false;
19637    break;
19638  case Intrinsic::aarch64_neon_srshl:
19639    Opcode = AArch64ISD::SRSHR_I;
19640    IsRightShift = true;
19641    break;
19642  case Intrinsic::aarch64_neon_urshl:
19643    Opcode = AArch64ISD::URSHR_I;
19644    IsRightShift = true;
19645    break;
19646  case Intrinsic::aarch64_neon_sqshlu:
19647    Opcode = AArch64ISD::SQSHLU_I;
19648    IsRightShift = false;
19649    break;
19650  case Intrinsic::aarch64_neon_sshl:
19651  case Intrinsic::aarch64_neon_ushl:
19652    // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
19653    // left shift in that case. For negative shift amounts we can instead use
19654    // VASHR/VLSHR as appropriate.
19655    if (ShiftAmount < 0) {
19656      Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
19657                                                   : AArch64ISD::VLSHR;
19658      ShiftAmount = -ShiftAmount;
19659    } else
19660      Opcode = AArch64ISD::VSHL;
19661    IsRightShift = false;
19662    break;
19663  }
19664
19665  EVT VT = N->getValueType(0);
19666  SDValue Op = N->getOperand(1);
19667  SDLoc dl(N);
19668  if (VT == MVT::i64) {
19669    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op);
19670    VT = MVT::v1i64;
19671  }
19672
19673  if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
19674    Op = DAG.getNode(Opcode, dl, VT, Op,
19675                     DAG.getConstant(-ShiftAmount, dl, MVT::i32));
19676    if (N->getValueType(0) == MVT::i64)
19677      Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
19678                       DAG.getConstant(0, dl, MVT::i64));
19679    return Op;
19680  } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
19681    Op = DAG.getNode(Opcode, dl, VT, Op,
19682                     DAG.getConstant(ShiftAmount, dl, MVT::i32));
19683    if (N->getValueType(0) == MVT::i64)
19684      Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
19685                       DAG.getConstant(0, dl, MVT::i64));
19686    return Op;
19687  }
19688
19689  return SDValue();
19690}
19691
19692// The CRC32[BH] instructions ignore the high bits of their data operand. Since
19693// the intrinsics must be legal and take an i32, this means there's almost
19694// certainly going to be a zext in the DAG which we can eliminate.
19695static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
19696  SDValue AndN = N->getOperand(2);
19697  if (AndN.getOpcode() != ISD::AND)
19698    return SDValue();
19699
19700  ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
19701  if (!CMask || CMask->getZExtValue() != Mask)
19702    return SDValue();
19703
19704  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
19705                     N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
19706}
19707
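// Lower an across-lanes reduction intrinsic to the corresponding AArch64ISD
// node and extract element 0 of the vector result to obtain the scalar value.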
19708static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
19709                                           SelectionDAG &DAG) {
19710  SDLoc dl(N);
19711  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
19712                     DAG.getNode(Opc, dl,
19713                                 N->getOperand(1).getSimpleValueType(),
19714                                 N->getOperand(1)),
19715                     DAG.getConstant(0, dl, MVT::i64));
19716}
19717
19718static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
19719  SDLoc DL(N);
19720  SDValue Op1 = N->getOperand(1);
19721  SDValue Op2 = N->getOperand(2);
19722  EVT ScalarTy = Op2.getValueType();
19723  if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
19724    ScalarTy = MVT::i32;
19725
19726  // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
19727  SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
19728  SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
19729  SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
19730  SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
19731  return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
19732}
19733
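// Lower a predicated SVE dup intrinsic to DUP_MERGE_PASSTHRU, promoting i8 and
// i16 scalar operands to i32 first.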
19734static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
19735  SDLoc dl(N);
19736  SDValue Scalar = N->getOperand(3);
19737  EVT ScalarTy = Scalar.getValueType();
19738
19739  if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
19740    Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
19741
19742  SDValue Passthru = N->getOperand(1);
19743  SDValue Pred = N->getOperand(2);
19744  return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
19745                     Pred, Scalar, Passthru);
19746}
19747
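// Lower the SVE ext intrinsic to AArch64ISD::EXT by bitcasting the operands to
// bytes and scaling the extract index by the element size.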
19748static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
19749  SDLoc dl(N);
19750  LLVMContext &Ctx = *DAG.getContext();
19751  EVT VT = N->getValueType(0);
19752
19753  assert(VT.isScalableVector() && "Expected a scalable vector.");
19754
19755  // Current lowering only supports the SVE-ACLE types.
19756  if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
19757    return SDValue();
19758
19759  unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
19760  unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
19761  EVT ByteVT =
19762      EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
19763
19764  // Convert everything to the domain of EXT (i.e. bytes).
19765  SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
19766  SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
19767  SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
19768                            DAG.getConstant(ElemSize, dl, MVT::i32));
19769
19770  SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
19771  return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
19772}
19773
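// Convert an SVE wide compare intrinsic whose comparator is a splat of a
// sufficiently small constant into a SETCC_MERGE_ZERO against a splat of that
// constant in the compared element type.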
19774static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
19775                                        TargetLowering::DAGCombinerInfo &DCI,
19776                                        SelectionDAG &DAG) {
19777  if (DCI.isBeforeLegalize())
19778    return SDValue();
19779
19780  SDValue Comparator = N->getOperand(3);
19781  if (Comparator.getOpcode() == AArch64ISD::DUP ||
19782      Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
19783    unsigned IID = getIntrinsicID(N);
19784    EVT VT = N->getValueType(0);
19785    EVT CmpVT = N->getOperand(2).getValueType();
19786    SDValue Pred = N->getOperand(1);
19787    SDValue Imm;
19788    SDLoc DL(N);
19789
19790    switch (IID) {
19791    default:
19792      llvm_unreachable("Called with wrong intrinsic!");
19793      break;
19794
19795    // Signed comparisons
19796    case Intrinsic::aarch64_sve_cmpeq_wide:
19797    case Intrinsic::aarch64_sve_cmpne_wide:
19798    case Intrinsic::aarch64_sve_cmpge_wide:
19799    case Intrinsic::aarch64_sve_cmpgt_wide:
19800    case Intrinsic::aarch64_sve_cmplt_wide:
19801    case Intrinsic::aarch64_sve_cmple_wide: {
19802      if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
19803        int64_t ImmVal = CN->getSExtValue();
19804        if (ImmVal >= -16 && ImmVal <= 15)
19805          Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
19806        else
19807          return SDValue();
19808      }
19809      break;
19810    }
19811    // Unsigned comparisons
19812    case Intrinsic::aarch64_sve_cmphs_wide:
19813    case Intrinsic::aarch64_sve_cmphi_wide:
19814    case Intrinsic::aarch64_sve_cmplo_wide:
19815    case Intrinsic::aarch64_sve_cmpls_wide: {
19816      if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
19817        uint64_t ImmVal = CN->getZExtValue();
19818        if (ImmVal <= 127)
19819          Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
19820        else
19821          return SDValue();
19822      }
19823      break;
19824    }
19825    }
19826
19827    if (!Imm)
19828      return SDValue();
19829
19830    SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
19831    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
19832                       N->getOperand(2), Splat, DAG.getCondCode(CC));
19833  }
19834
19835  return SDValue();
19836}
19837
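// Emit a PTEST of Op governed by Pg and materialise the requested condition
// code as a 0/1 value of type VT using a CSEL on the resulting flags.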
19838static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
19839                        AArch64CC::CondCode Cond) {
19840  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19841
19842  SDLoc DL(Op);
19843  assert(Op.getValueType().isScalableVector() &&
19844         TLI.isTypeLegal(Op.getValueType()) &&
19845         "Expected legal scalable vector type!");
19846  assert(Op.getValueType() == Pg.getValueType() &&
19847         "Expected same type for PTEST operands");
19848
19849  // Ensure target specific opcodes are using legal type.
19850  EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
19851  SDValue TVal = DAG.getConstant(1, DL, OutVT);
19852  SDValue FVal = DAG.getConstant(0, DL, OutVT);
19853
19854  // Ensure operands have type nxv16i1.
19855  if (Op.getValueType() != MVT::nxv16i1) {
19856    if ((Cond == AArch64CC::ANY_ACTIVE || Cond == AArch64CC::NONE_ACTIVE) &&
19857        isZeroingInactiveLanes(Op))
19858      Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
19859    else
19860      Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
19861    Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
19862  }
19863
19864  // Set condition code (CC) flags.
19865  SDValue Test = DAG.getNode(
19866      Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST,
19867      DL, MVT::Other, Pg, Op);
19868
19869  // Convert CC to integer based on requested condition.
19870  // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
19871  SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
19872  SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
19873  return DAG.getZExtOrTrunc(Res, DL, VT);
19874}
19875
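// Lower a predicated SVE integer reduction intrinsic to the given AArch64ISD
// reduction node, extracting the scalar result from element 0.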
19876static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
19877                                      SelectionDAG &DAG) {
19878  SDLoc DL(N);
19879
19880  SDValue Pred = N->getOperand(1);
19881  SDValue VecToReduce = N->getOperand(2);
19882
19883  // NOTE: The integer reduction's result type is not always linked to the
19884  // operand's element type so we construct it from the intrinsic's result type.
19885  EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
19886  SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
19887
19888  // SVE reductions set the whole vector register with the first element
19889  // containing the reduction result, which we'll now extract.
19890  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
19891  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
19892                     Zero);
19893}
19894
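// Lower a predicated SVE floating-point reduction intrinsic to the given
// AArch64ISD reduction node, extracting the scalar result from element 0.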
19895static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
19896                                     SelectionDAG &DAG) {
19897  SDLoc DL(N);
19898
19899  SDValue Pred = N->getOperand(1);
19900  SDValue VecToReduce = N->getOperand(2);
19901
19902  EVT ReduceVT = VecToReduce.getValueType();
19903  SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
19904
19905  // SVE reductions set the whole vector register with the first element
19906  // containing the reduction result, which we'll now extract.
19907  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
19908  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
19909                     Zero);
19910}
19911
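// Lower an ordered SVE floating-point reduction intrinsic: the initial value
// is placed in element 0 of the accumulator vector and the scalar result is
// extracted from element 0 of the reduction.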
19912static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
19913                                            SelectionDAG &DAG) {
19914  SDLoc DL(N);
19915
19916  SDValue Pred = N->getOperand(1);
19917  SDValue InitVal = N->getOperand(2);
19918  SDValue VecToReduce = N->getOperand(3);
19919  EVT ReduceVT = VecToReduce.getValueType();
19920
19921  // Ordered reductions use the first lane of the result vector as the
19922  // reduction's initial value.
19923  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
19924  InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
19925                        DAG.getUNDEF(ReduceVT), InitVal, Zero);
19926
19927  SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
19928
19929  // SVE reductions set the whole vector register with the first element
19930  // containing the reduction result, which we'll now extract.
19931  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
19932                     Zero);
19933}
19934
19935// If a merged operation has no inactive lanes we can relax it to a predicated
19936// or unpredicated operation, which potentially allows better isel (perhaps
19937// using immediate forms) or relaxing register reuse requirements.
19938static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
19939                                       SelectionDAG &DAG, bool UnpredOp = false,
19940                                       bool SwapOperands = false) {
19941  assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
19942  assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
19943  SDValue Pg = N->getOperand(1);
19944  SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
19945  SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
19946
19947  // ISD way to specify an all active predicate.
19948  if (isAllActivePredicate(DAG, Pg)) {
19949    if (UnpredOp)
19950      return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
19951
19952    return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
19953  }
19954
19955  // FUTURE: SplatVector(true)
19956  return SDValue();
19957}
19958
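// Perform target-specific combines and lowerings for intrinsic nodes (NEON and
// SVE intrinsics, plus get_active_lane_mask).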
19959static SDValue performIntrinsicCombine(SDNode *N,
19960                                       TargetLowering::DAGCombinerInfo &DCI,
19961                                       const AArch64Subtarget *Subtarget) {
19962  SelectionDAG &DAG = DCI.DAG;
19963  unsigned IID = getIntrinsicID(N);
19964  switch (IID) {
19965  default:
19966    break;
19967  case Intrinsic::get_active_lane_mask: {
19968    SDValue Res = SDValue();
19969    EVT VT = N->getValueType(0);
19970    if (VT.isFixedLengthVector()) {
19971      // We can use the SVE whilelo instruction to lower this intrinsic by
19972      // creating the appropriate sequence of scalable vector operations and
19973      // then extracting a fixed-width subvector from the scalable vector.
19974
19975      SDLoc DL(N);
19976      SDValue ID =
19977          DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
19978
19979      EVT WhileVT = EVT::getVectorVT(
19980          *DAG.getContext(), MVT::i1,
19981          ElementCount::getScalable(VT.getVectorNumElements()));
19982
19983      // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
19984      EVT PromVT = getPromotedVTForPredicate(WhileVT);
19985
19986      // Get the fixed-width equivalent of PromVT for extraction.
19987      EVT ExtVT =
19988          EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(),
19989                           VT.getVectorElementCount());
19990
19991      Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
19992                        N->getOperand(1), N->getOperand(2));
19993      Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
19994      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
19995                        DAG.getConstant(0, DL, MVT::i64));
19996      Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
19997    }
19998    return Res;
19999  }
20000  case Intrinsic::aarch64_neon_vcvtfxs2fp:
20001  case Intrinsic::aarch64_neon_vcvtfxu2fp:
20002    return tryCombineFixedPointConvert(N, DCI, DAG);
20003  case Intrinsic::aarch64_neon_saddv:
20004    return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
20005  case Intrinsic::aarch64_neon_uaddv:
20006    return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
20007  case Intrinsic::aarch64_neon_sminv:
20008    return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
20009  case Intrinsic::aarch64_neon_uminv:
20010    return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
20011  case Intrinsic::aarch64_neon_smaxv:
20012    return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
20013  case Intrinsic::aarch64_neon_umaxv:
20014    return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
20015  case Intrinsic::aarch64_neon_fmax:
20016    return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
20017                       N->getOperand(1), N->getOperand(2));
20018  case Intrinsic::aarch64_neon_fmin:
20019    return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
20020                       N->getOperand(1), N->getOperand(2));
20021  case Intrinsic::aarch64_neon_fmaxnm:
20022    return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
20023                       N->getOperand(1), N->getOperand(2));
20024  case Intrinsic::aarch64_neon_fminnm:
20025    return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
20026                       N->getOperand(1), N->getOperand(2));
20027  case Intrinsic::aarch64_neon_smull:
20028    return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
20029                       N->getOperand(1), N->getOperand(2));
20030  case Intrinsic::aarch64_neon_umull:
20031    return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
20032                       N->getOperand(1), N->getOperand(2));
20033  case Intrinsic::aarch64_neon_pmull:
20034    return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
20035                       N->getOperand(1), N->getOperand(2));
20036  case Intrinsic::aarch64_neon_sqdmull:
20037    return tryCombineLongOpWithDup(IID, N, DCI, DAG);
20038  case Intrinsic::aarch64_neon_sqshl:
20039  case Intrinsic::aarch64_neon_uqshl:
20040  case Intrinsic::aarch64_neon_sqshlu:
20041  case Intrinsic::aarch64_neon_srshl:
20042  case Intrinsic::aarch64_neon_urshl:
20043  case Intrinsic::aarch64_neon_sshl:
20044  case Intrinsic::aarch64_neon_ushl:
20045    return tryCombineShiftImm(IID, N, DAG);
20046  case Intrinsic::aarch64_neon_sabd:
20047    return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
20048                       N->getOperand(1), N->getOperand(2));
20049  case Intrinsic::aarch64_neon_uabd:
20050    return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
20051                       N->getOperand(1), N->getOperand(2));
20052  case Intrinsic::aarch64_crc32b:
20053  case Intrinsic::aarch64_crc32cb:
20054    return tryCombineCRC32(0xff, N, DAG);
20055  case Intrinsic::aarch64_crc32h:
20056  case Intrinsic::aarch64_crc32ch:
20057    return tryCombineCRC32(0xffff, N, DAG);
20058  case Intrinsic::aarch64_sve_saddv:
20059    // There is no i64 version of SADDV because the sign is irrelevant.
20060    if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
20061      return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
20062    else
20063      return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
20064  case Intrinsic::aarch64_sve_uaddv:
20065    return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
20066  case Intrinsic::aarch64_sve_smaxv:
20067    return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
20068  case Intrinsic::aarch64_sve_umaxv:
20069    return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
20070  case Intrinsic::aarch64_sve_sminv:
20071    return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
20072  case Intrinsic::aarch64_sve_uminv:
20073    return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
20074  case Intrinsic::aarch64_sve_orv:
20075    return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
20076  case Intrinsic::aarch64_sve_eorv:
20077    return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
20078  case Intrinsic::aarch64_sve_andv:
20079    return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
20080  case Intrinsic::aarch64_sve_index:
20081    return LowerSVEIntrinsicIndex(N, DAG);
20082  case Intrinsic::aarch64_sve_dup:
20083    return LowerSVEIntrinsicDUP(N, DAG);
20084  case Intrinsic::aarch64_sve_dup_x:
20085    return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
20086                       N->getOperand(1));
20087  case Intrinsic::aarch64_sve_ext:
20088    return LowerSVEIntrinsicEXT(N, DAG);
20089  case Intrinsic::aarch64_sve_mul_u:
20090    return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
20091                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
20092  case Intrinsic::aarch64_sve_smulh_u:
20093    return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
20094                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
20095  case Intrinsic::aarch64_sve_umulh_u:
20096    return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
20097                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
20098  case Intrinsic::aarch64_sve_smin_u:
20099    return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
20100                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
20101  case Intrinsic::aarch64_sve_umin_u:
20102    return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
20103                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
20104  case Intrinsic::aarch64_sve_smax_u:
20105    return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
20106                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
20107  case Intrinsic::aarch64_sve_umax_u:
20108    return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
20109                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
20110  case Intrinsic::aarch64_sve_lsl_u:
20111    return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
20112                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
20113  case Intrinsic::aarch64_sve_lsr_u:
20114    return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
20115                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
20116  case Intrinsic::aarch64_sve_asr_u:
20117    return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
20118                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
20119  case Intrinsic::aarch64_sve_fadd_u:
20120    return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
20121                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
20122  case Intrinsic::aarch64_sve_fdiv_u:
20123    return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
20124                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
20125  case Intrinsic::aarch64_sve_fmax_u:
20126    return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
20127                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
20128  case Intrinsic::aarch64_sve_fmaxnm_u:
20129    return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
20130                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
20131  case Intrinsic::aarch64_sve_fmla_u:
20132    return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
20133                       N->getOperand(1), N->getOperand(3), N->getOperand(4),
20134                       N->getOperand(2));
20135  case Intrinsic::aarch64_sve_fmin_u:
20136    return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
20137                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
20138  case Intrinsic::aarch64_sve_fminnm_u:
20139    return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
20140                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
20141  case Intrinsic::aarch64_sve_fmul_u:
20142    return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
20143                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
20144  case Intrinsic::aarch64_sve_fsub_u:
20145    return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
20146                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
20147  case Intrinsic::aarch64_sve_add_u:
20148    return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
20149                       N->getOperand(3));
20150  case Intrinsic::aarch64_sve_sub_u:
20151    return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
20152                       N->getOperand(3));
20153  case Intrinsic::aarch64_sve_subr:
20154    return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
20155  case Intrinsic::aarch64_sve_and_u:
20156    return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
20157                       N->getOperand(3));
20158  case Intrinsic::aarch64_sve_bic_u:
20159    return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
20160                       N->getOperand(2), N->getOperand(3));
20161  case Intrinsic::aarch64_sve_eor_u:
20162    return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
20163                       N->getOperand(3));
20164  case Intrinsic::aarch64_sve_orr_u:
20165    return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
20166                       N->getOperand(3));
20167  case Intrinsic::aarch64_sve_sabd_u:
20168    return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
20169                       N->getOperand(2), N->getOperand(3));
20170  case Intrinsic::aarch64_sve_uabd_u:
20171    return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
20172                       N->getOperand(2), N->getOperand(3));
20173  case Intrinsic::aarch64_sve_sdiv_u:
20174    return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
20175                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
20176  case Intrinsic::aarch64_sve_udiv_u:
20177    return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
20178                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
20179  case Intrinsic::aarch64_sve_sqadd:
20180    return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
20181  case Intrinsic::aarch64_sve_sqsub_u:
20182    return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
20183                       N->getOperand(2), N->getOperand(3));
20184  case Intrinsic::aarch64_sve_uqadd:
20185    return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
20186  case Intrinsic::aarch64_sve_uqsub_u:
20187    return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
20188                       N->getOperand(2), N->getOperand(3));
20189  case Intrinsic::aarch64_sve_sqadd_x:
20190    return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
20191                       N->getOperand(1), N->getOperand(2));
20192  case Intrinsic::aarch64_sve_sqsub_x:
20193    return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
20194                       N->getOperand(1), N->getOperand(2));
20195  case Intrinsic::aarch64_sve_uqadd_x:
20196    return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
20197                       N->getOperand(1), N->getOperand(2));
20198  case Intrinsic::aarch64_sve_uqsub_x:
20199    return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
20200                       N->getOperand(1), N->getOperand(2));
20201  case Intrinsic::aarch64_sve_asrd:
20202    return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
20203                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
20204  case Intrinsic::aarch64_sve_cmphs:
20205    if (!N->getOperand(2).getValueType().isFloatingPoint())
20206      return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20207                         N->getValueType(0), N->getOperand(1), N->getOperand(2),
20208                         N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
20209    break;
20210  case Intrinsic::aarch64_sve_cmphi:
20211    if (!N->getOperand(2).getValueType().isFloatingPoint())
20212      return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20213                         N->getValueType(0), N->getOperand(1), N->getOperand(2),
20214                         N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
20215    break;
20216  case Intrinsic::aarch64_sve_fcmpge:
20217  case Intrinsic::aarch64_sve_cmpge:
20218    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20219                       N->getValueType(0), N->getOperand(1), N->getOperand(2),
20220                       N->getOperand(3), DAG.getCondCode(ISD::SETGE));
20221    break;
20222  case Intrinsic::aarch64_sve_fcmpgt:
20223  case Intrinsic::aarch64_sve_cmpgt:
20224    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20225                       N->getValueType(0), N->getOperand(1), N->getOperand(2),
20226                       N->getOperand(3), DAG.getCondCode(ISD::SETGT));
20227    break;
20228  case Intrinsic::aarch64_sve_fcmpeq:
20229  case Intrinsic::aarch64_sve_cmpeq:
20230    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20231                       N->getValueType(0), N->getOperand(1), N->getOperand(2),
20232                       N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
20233    break;
20234  case Intrinsic::aarch64_sve_fcmpne:
20235  case Intrinsic::aarch64_sve_cmpne:
20236    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20237                       N->getValueType(0), N->getOperand(1), N->getOperand(2),
20238                       N->getOperand(3), DAG.getCondCode(ISD::SETNE));
20239    break;
20240  case Intrinsic::aarch64_sve_fcmpuo:
20241    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20242                       N->getValueType(0), N->getOperand(1), N->getOperand(2),
20243                       N->getOperand(3), DAG.getCondCode(ISD::SETUO));
20244    break;
20245  case Intrinsic::aarch64_sve_fadda:
20246    return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
20247  case Intrinsic::aarch64_sve_faddv:
20248    return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
20249  case Intrinsic::aarch64_sve_fmaxnmv:
20250    return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
20251  case Intrinsic::aarch64_sve_fmaxv:
20252    return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
20253  case Intrinsic::aarch64_sve_fminnmv:
20254    return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
20255  case Intrinsic::aarch64_sve_fminv:
20256    return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
20257  case Intrinsic::aarch64_sve_sel:
20258    return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
20259                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
20260  case Intrinsic::aarch64_sve_cmpeq_wide:
20261    return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
20262  case Intrinsic::aarch64_sve_cmpne_wide:
20263    return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
20264  case Intrinsic::aarch64_sve_cmpge_wide:
20265    return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
20266  case Intrinsic::aarch64_sve_cmpgt_wide:
20267    return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
20268  case Intrinsic::aarch64_sve_cmplt_wide:
20269    return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
20270  case Intrinsic::aarch64_sve_cmple_wide:
20271    return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
20272  case Intrinsic::aarch64_sve_cmphs_wide:
20273    return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
20274  case Intrinsic::aarch64_sve_cmphi_wide:
20275    return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
20276  case Intrinsic::aarch64_sve_cmplo_wide:
20277    return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
20278  case Intrinsic::aarch64_sve_cmpls_wide:
20279    return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
20280  case Intrinsic::aarch64_sve_ptest_any:
20281    return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20282                    AArch64CC::ANY_ACTIVE);
20283  case Intrinsic::aarch64_sve_ptest_first:
20284    return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20285                    AArch64CC::FIRST_ACTIVE);
20286  case Intrinsic::aarch64_sve_ptest_last:
20287    return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20288                    AArch64CC::LAST_ACTIVE);
20289  }
20290  return SDValue();
20291}
20292
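// Returns true if extending N is expected to be cheap, i.e. it is a (masked)
// load that can be turned into an extending load, or an all-zeros splat.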
20293static bool isCheapToExtend(const SDValue &N) {
20294  unsigned OC = N->getOpcode();
20295  return OC == ISD::LOAD || OC == ISD::MLOAD ||
20296         ISD::isConstantSplatVectorAllZeros(N.getNode());
20297}
20298
20299static SDValue
20300performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20301                              SelectionDAG &DAG) {
20302  // If we have (sext (setcc A B)) and A and B are cheap to extend,
20303  // we can move the sext into the arguments and have the same result. For
20304  // example, if A and B are both loads, we can make those extending loads and
20305  // avoid an extra instruction. This pattern appears often in VLS code
20306  // generation where the inputs to the setcc have a different size to the
20307  // instruction that wants to use the result of the setcc.
20308  assert(N->getOpcode() == ISD::SIGN_EXTEND &&
20309         N->getOperand(0)->getOpcode() == ISD::SETCC);
20310  const SDValue SetCC = N->getOperand(0);
20311
20312  const SDValue CCOp0 = SetCC.getOperand(0);
20313  const SDValue CCOp1 = SetCC.getOperand(1);
20314  if (!CCOp0->getValueType(0).isInteger() ||
20315      !CCOp1->getValueType(0).isInteger())
20316    return SDValue();
20317
20318  ISD::CondCode Code =
20319      cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
20320
20321  ISD::NodeType ExtType =
20322      isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
20323
20324  if (isCheapToExtend(SetCC.getOperand(0)) &&
20325      isCheapToExtend(SetCC.getOperand(1))) {
20326    const SDValue Ext1 =
20327        DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
20328    const SDValue Ext2 =
20329        DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
20330
20331    return DAG.getSetCC(
20332        SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
20333        cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
20334  }
20335
20336  return SDValue();
20337}
20338
20339static SDValue performExtendCombine(SDNode *N,
20340                                    TargetLowering::DAGCombinerInfo &DCI,
20341                                    SelectionDAG &DAG) {
20342  // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
20343  // we can convert that DUP into another extract_high (of a bigger DUP), which
20344  // helps the backend to decide that an sabdl2 would be useful, saving a real
20345  // extract_high operation.
20346  if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
20347      (N->getOperand(0).getOpcode() == ISD::ABDU ||
20348       N->getOperand(0).getOpcode() == ISD::ABDS)) {
20349    SDNode *ABDNode = N->getOperand(0).getNode();
20350    SDValue NewABD =
20351        tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
20352    if (!NewABD.getNode())
20353      return SDValue();
20354
20355    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
20356  }
20357
20358  if (N->getValueType(0).isFixedLengthVector() &&
20359      N->getOpcode() == ISD::SIGN_EXTEND &&
20360      N->getOperand(0)->getOpcode() == ISD::SETCC)
20361    return performSignExtendSetCCCombine(N, DCI, DAG);
20362
20363  return SDValue();
20364}
20365
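// Replace the vector store St of a splatted value with NumVecElts scalar
// stores of SplatVal at consecutive offsets; the load/store optimizer can then
// merge these into store pair instructions.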
20366static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
20367                               SDValue SplatVal, unsigned NumVecElts) {
20368  assert(!St.isTruncatingStore() && "cannot split truncating vector store");
20369  Align OrigAlignment = St.getAlign();
20370  unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
20371
20372  // Create scalar stores. This is at least as good as the code sequence for a
20373  // split unaligned store which is a dup.s, ext.b, and two stores.
20374  // Most of the time the three stores should be replaced by store pair
20375  // instructions (stp).
20376  SDLoc DL(&St);
20377  SDValue BasePtr = St.getBasePtr();
20378  uint64_t BaseOffset = 0;
20379
20380  const MachinePointerInfo &PtrInfo = St.getPointerInfo();
20381  SDValue NewST1 =
20382      DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
20383                   OrigAlignment, St.getMemOperand()->getFlags());
20384
20385  // As this is in ISel, we will not merge this add, which may degrade results.
20386  if (BasePtr->getOpcode() == ISD::ADD &&
20387      isa<ConstantSDNode>(BasePtr->getOperand(1))) {
20388    BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
20389    BasePtr = BasePtr->getOperand(0);
20390  }
20391
20392  unsigned Offset = EltOffset;
20393  while (--NumVecElts) {
20394    Align Alignment = commonAlignment(OrigAlignment, Offset);
20395    SDValue OffsetPtr =
20396        DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
20397                    DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
20398    NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
20399                          PtrInfo.getWithOffset(Offset), Alignment,
20400                          St.getMemOperand()->getFlags());
20401    Offset += EltOffset;
20402  }
20403  return NewST1;
20404}
20405
20406// Returns an SVE type that ContentTy can be trivially sign or zero extended
20407// into.
20408static MVT getSVEContainerType(EVT ContentTy) {
20409  assert(ContentTy.isSimple() && "No SVE containers for extended types");
20410
20411  switch (ContentTy.getSimpleVT().SimpleTy) {
20412  default:
20413    llvm_unreachable("No known SVE container for this MVT type");
20414  case MVT::nxv2i8:
20415  case MVT::nxv2i16:
20416  case MVT::nxv2i32:
20417  case MVT::nxv2i64:
20418  case MVT::nxv2f32:
20419  case MVT::nxv2f64:
20420    return MVT::nxv2i64;
20421  case MVT::nxv4i8:
20422  case MVT::nxv4i16:
20423  case MVT::nxv4i32:
20424  case MVT::nxv4f32:
20425    return MVT::nxv4i32;
20426  case MVT::nxv8i8:
20427  case MVT::nxv8i16:
20428  case MVT::nxv8f16:
20429  case MVT::nxv8bf16:
20430    return MVT::nxv8i16;
20431  case MVT::nxv16i8:
20432    return MVT::nxv16i8;
20433  }
20434}
20435
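// Lower an SVE predicated load intrinsic to the given load node Opc,
// performing the load in the SVE container type and truncating integer results
// back to VT where necessary.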
20436static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
20437  SDLoc DL(N);
20438  EVT VT = N->getValueType(0);
20439
20440  if (VT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
20441    return SDValue();
20442
20443  EVT ContainerVT = VT;
20444  if (ContainerVT.isInteger())
20445    ContainerVT = getSVEContainerType(ContainerVT);
20446
20447  SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
20448  SDValue Ops[] = { N->getOperand(0), // Chain
20449                    N->getOperand(2), // Pg
20450                    N->getOperand(3), // Base
20451                    DAG.getValueType(VT) };
20452
20453  SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
20454  SDValue LoadChain = SDValue(Load.getNode(), 1);
20455
20456  if (ContainerVT.isInteger() && (VT != ContainerVT))
20457    Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
20458
20459  return DAG.getMergeValues({ Load, LoadChain }, DL);
20460}
20461
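// Lower an SVE non-temporal load intrinsic to a masked load with a zero
// passthru, bitcasting floating-point results from the equivalent integer
// type.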
20462static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
20463  SDLoc DL(N);
20464  EVT VT = N->getValueType(0);
20465  EVT PtrTy = N->getOperand(3).getValueType();
20466
20467  EVT LoadVT = VT;
20468  if (VT.isFloatingPoint())
20469    LoadVT = VT.changeTypeToInteger();
20470
20471  auto *MINode = cast<MemIntrinsicSDNode>(N);
20472  SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
20473  SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
20474                                MINode->getOperand(3), DAG.getUNDEF(PtrTy),
20475                                MINode->getOperand(2), PassThru,
20476                                MINode->getMemoryVT(), MINode->getMemOperand(),
20477                                ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
20478
20479  if (VT.isFloatingPoint()) {
20480    SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
20481    return DAG.getMergeValues(Ops, DL);
20482  }
20483
20484  return L;
20485}
20486
20487template <unsigned Opcode>
20488static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
20489  static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
20490                    Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
20491                "Unsupported opcode.");
20492  SDLoc DL(N);
20493  EVT VT = N->getValueType(0);
20494
20495  EVT LoadVT = VT;
20496  if (VT.isFloatingPoint())
20497    LoadVT = VT.changeTypeToInteger();
20498
20499  SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
20500  SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
20501  SDValue LoadChain = SDValue(Load.getNode(), 1);
20502
20503  if (VT.isFloatingPoint())
20504    Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
20505
20506  return DAG.getMergeValues({Load, LoadChain}, DL);
20507}
20508
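// Lower an SVE predicated store intrinsic to ST1_PRED, bitcasting
// floating-point data (or any-extending integer data) to the SVE container
// type.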
20509static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
20510  SDLoc DL(N);
20511  SDValue Data = N->getOperand(2);
20512  EVT DataVT = Data.getValueType();
20513  EVT HwSrcVt = getSVEContainerType(DataVT);
20514  SDValue InputVT = DAG.getValueType(DataVT);
20515
20516  if (DataVT.isFloatingPoint())
20517    InputVT = DAG.getValueType(HwSrcVt);
20518
20519  SDValue SrcNew;
20520  if (Data.getValueType().isFloatingPoint())
20521    SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
20522  else
20523    SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
20524
20525  SDValue Ops[] = { N->getOperand(0), // Chain
20526                    SrcNew,
20527                    N->getOperand(4), // Base
20528                    N->getOperand(3), // Pg
20529                    InputVT
20530                  };
20531
20532  return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
20533}
20534
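// Lower an SVE non-temporal store intrinsic to a masked store, bitcasting
// floating-point data to the equivalent integer type.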
20535static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
20536  SDLoc DL(N);
20537
20538  SDValue Data = N->getOperand(2);
20539  EVT DataVT = Data.getValueType();
20540  EVT PtrTy = N->getOperand(4).getValueType();
20541
20542  if (DataVT.isFloatingPoint())
20543    Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
20544
20545  auto *MINode = cast<MemIntrinsicSDNode>(N);
20546  return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
20547                            DAG.getUNDEF(PtrTy), MINode->getOperand(3),
20548                            MINode->getMemoryVT(), MINode->getMemOperand(),
20549                            ISD::UNINDEXED, false, false);
20550}
20551
20552/// Replace a store of a splat of zeros by scalar stores of WZR/XZR. The load
20553/// store optimizer pass will merge them into store pair instructions. This
20554/// should be better than a movi to create the vector zero followed by a vector
20555/// store if the zero constant is not re-used, since one instruction and one
20556/// register live range will be removed.
20557///
20558/// For example, the final generated code should be:
20559///
20560///   stp xzr, xzr, [x0]
20561///
20562/// instead of:
20563///
20564///   movi v0.2d, #0
20565///   str q0, [x0]
20566///
20567static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
20568  SDValue StVal = St.getValue();
20569  EVT VT = StVal.getValueType();
20570
20571  // Avoid scalarizing zero splat stores for scalable vectors.
20572  if (VT.isScalableVector())
20573    return SDValue();
20574
20575  // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
20576  // 2, 3 or 4 i32 elements.
20577  int NumVecElts = VT.getVectorNumElements();
20578  if (!(((NumVecElts == 2 || NumVecElts == 3) &&
20579         VT.getVectorElementType().getSizeInBits() == 64) ||
20580        ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
20581         VT.getVectorElementType().getSizeInBits() == 32)))
20582    return SDValue();
20583
20584  if (StVal.getOpcode() != ISD::BUILD_VECTOR)
20585    return SDValue();
20586
20587  // If the zero constant has more than one use then the vector store could be
20588  // better since the constant mov will be amortized and stp q instructions
20589  // should be able to be formed.
20590  if (!StVal.hasOneUse())
20591    return SDValue();
20592
20593  // If the store is truncating then it's going down to i16 or smaller, which
20594  // means it can be implemented in a single store anyway.
20595  if (St.isTruncatingStore())
20596    return SDValue();
20597
20598  // If the immediate offset of the address operand is too large for the stp
20599  // instruction, then bail out.
20600  if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
20601    int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
20602    if (Offset < -512 || Offset > 504)
20603      return SDValue();
20604  }
20605
20606  for (int I = 0; I < NumVecElts; ++I) {
20607    SDValue EltVal = StVal.getOperand(I);
20608    if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
20609      return SDValue();
20610  }
20611
20612  // Use a CopyFromReg WZR/XZR here to prevent
20613  // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
20614  SDLoc DL(&St);
20615  unsigned ZeroReg;
20616  EVT ZeroVT;
20617  if (VT.getVectorElementType().getSizeInBits() == 32) {
20618    ZeroReg = AArch64::WZR;
20619    ZeroVT = MVT::i32;
20620  } else {
20621    ZeroReg = AArch64::XZR;
20622    ZeroVT = MVT::i64;
20623  }
20624  SDValue SplatVal =
20625      DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
20626  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
20627}
20628
20629/// Replace a store of a splatted scalar by scalar stores of the scalar value.
20630/// The load store optimizer pass will merge them into store pair instructions.
20631/// This has better performance than a splat of the scalar followed by a split
20632/// vector store. Even if the stores are not merged it is four stores vs a dup,
20633/// followed by an ext.b and two stores.
20634static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
20635  SDValue StVal = St.getValue();
20636  EVT VT = StVal.getValueType();
20637
20638  // Don't replace floating point stores, they possibly won't be transformed to
20639  // stp because of the store pair suppress pass.
20640  if (VT.isFloatingPoint())
20641    return SDValue();
20642
20643  // We can express a splat as store pair(s) for 2 or 4 elements.
20644  unsigned NumVecElts = VT.getVectorNumElements();
20645  if (NumVecElts != 4 && NumVecElts != 2)
20646    return SDValue();
20647
20648  // If the store is truncating then it's going down to i16 or smaller, which
20649  // means it can be implemented in a single store anyway.
20650  if (St.isTruncatingStore())
20651    return SDValue();
20652
20653  // Check that this is a splat.
20654  // Make sure that each of the relevant vector element locations are inserted
20655  // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
20656  std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
20657  SDValue SplatVal;
20658  for (unsigned I = 0; I < NumVecElts; ++I) {
20659    // Check for insert vector elements.
20660    if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
20661      return SDValue();
20662
20663    // Check that same value is inserted at each vector element.
20664    if (I == 0)
20665      SplatVal = StVal.getOperand(1);
20666    else if (StVal.getOperand(1) != SplatVal)
20667      return SDValue();
20668
20669    // Check insert element index.
20670    ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
20671    if (!CIndex)
20672      return SDValue();
20673    uint64_t IndexVal = CIndex->getZExtValue();
20674    if (IndexVal >= NumVecElts)
20675      return SDValue();
20676    IndexNotInserted.reset(IndexVal);
20677
20678    StVal = StVal.getOperand(0);
20679  }
20680  // Check that all vector element locations were inserted to.
20681  if (IndexNotInserted.any())
    return SDValue();
20683
20684  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
20685}
20686
20687static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20688                           SelectionDAG &DAG,
20689                           const AArch64Subtarget *Subtarget) {
20690
20691  StoreSDNode *S = cast<StoreSDNode>(N);
20692  if (S->isVolatile() || S->isIndexed())
20693    return SDValue();
20694
20695  SDValue StVal = S->getValue();
20696  EVT VT = StVal.getValueType();
20697
20698  if (!VT.isFixedLengthVector())
20699    return SDValue();
20700
20701  // If we get a splat of zeros, convert this vector store to a store of
20702  // scalars. They will be merged into store pairs of xzr thereby removing one
20703  // instruction and one register.
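  // For example (illustrative), a v2i64 zeroinitializer store becomes two i64
  // stores of XZR, which the load/store optimizer merges into a single
  // "stp xzr, xzr, [x0]".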
20704  if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
20705    return ReplacedZeroSplat;
20706
20707  // FIXME: The logic for deciding if an unaligned store should be split should
20708  // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
20709  // a call to that function here.
20710
20711  if (!Subtarget->isMisaligned128StoreSlow())
20712    return SDValue();
20713
20714  // Don't split at -Oz.
20715  if (DAG.getMachineFunction().getFunction().hasMinSize())
20716    return SDValue();
20717
20718  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
20719  // those up regresses performance on micro-benchmarks and olden/bh.
20720  if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
20721    return SDValue();
20722
20723  // Split unaligned 16B stores. They are terrible for performance.
20724  // Don't split stores with alignment of 1 or 2. Code that uses clang vector
20725  // extensions can use this to mark that it does not want splitting to happen
20726  // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
20727  // eliminating alignment hazards is only 1 in 8 for alignment of 2.
20728  if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
20729      S->getAlign() <= Align(2))
20730    return SDValue();
20731
20732  // If we get a splat of a scalar convert this vector store to a store of
20733  // scalars. They will be merged into store pairs thereby removing two
20734  // instructions.
20735  if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
20736    return ReplacedSplat;
20737
20738  SDLoc DL(S);
20739
20740  // Split VT into two.
20741  EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
20742  unsigned NumElts = HalfVT.getVectorNumElements();
20743  SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
20744                                   DAG.getConstant(0, DL, MVT::i64));
20745  SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
20746                                   DAG.getConstant(NumElts, DL, MVT::i64));
20747  SDValue BasePtr = S->getBasePtr();
20748  SDValue NewST1 =
20749      DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
20750                   S->getAlign(), S->getMemOperand()->getFlags());
20751  SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
20752                                  DAG.getConstant(8, DL, MVT::i64));
20753  return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
20754                      S->getPointerInfo(), S->getAlign(),
20755                      S->getMemOperand()->getFlags());
20756}
20757
20758static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
20760
20761  // splice(pg, op1, undef) -> op1
20762  if (N->getOperand(2).isUndef())
20763    return N->getOperand(1);
20764
20765  return SDValue();
20766}
20767
20768static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
20769                                    const AArch64Subtarget *Subtarget) {
20770  assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
20771          N->getOpcode() == AArch64ISD::UUNPKLO) &&
20772         "Unexpected Opcode!");
20773
20774  // uunpklo/hi undef -> undef
20775  if (N->getOperand(0).isUndef())
20776    return DAG.getUNDEF(N->getValueType(0));
20777
20778  // If this is a masked load followed by an UUNPKLO, fold this into a masked
20779  // extending load.  We can do this even if this is already a masked
20780  // {z,}extload.
20781  if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
20782      N->getOpcode() == AArch64ISD::UUNPKLO) {
20783    MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
20784    SDValue Mask = MLD->getMask();
20785    SDLoc DL(N);
20786
20787    if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
20788        SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
20789        (MLD->getPassThru()->isUndef() ||
20790         isZerosVector(MLD->getPassThru().getNode()))) {
20791      unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
20792      unsigned PgPattern = Mask->getConstantOperandVal(0);
20793      EVT VT = N->getValueType(0);
20794
20795      // Ensure we can double the size of the predicate pattern
20796      unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
20797      if (NumElts &&
20798          NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
20799        Mask =
20800            getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
20801        SDValue PassThru = DAG.getConstant(0, DL, VT);
20802        SDValue NewLoad = DAG.getMaskedLoad(
20803            VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
20804            PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
20805            MLD->getAddressingMode(), ISD::ZEXTLOAD);
20806
20807        DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
20808
20809        return NewLoad;
20810      }
20811    }
20812  }
20813
20814  return SDValue();
20815}
20816
20817// Try to simplify:
20818//    t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
20819//    t2 = nxv8i16 srl(t1, ShiftValue)
20820// to
//    t1 = nxv8i16 rshrnb(X, ShiftValue).
// rshrnb will zero the top half bits of each element. Therefore, this combine
// should only be performed when a following instruction that uses the rshrnb
// as an operand does not care about the top half of each element. For example,
20825// a uzp1 or a truncating store.
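// For example (illustrative), with X : nxv8i16 and ShiftValue == 4 the pair
//    add(X, 8) ; srl(..., 4)
// maps onto a single "rshrnb z0.b, z1.h, #4", whose top half-elements are
// zero.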
20826static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
20827                                         const AArch64Subtarget *Subtarget) {
20828  EVT VT = Srl->getValueType(0);
20829
20830  if (!VT.isScalableVector() || !Subtarget->hasSVE2() ||
20831      Srl->getOpcode() != ISD::SRL)
20832    return SDValue();
20833
20834  EVT ResVT;
20835  if (VT == MVT::nxv8i16)
20836    ResVT = MVT::nxv16i8;
20837  else if (VT == MVT::nxv4i32)
20838    ResVT = MVT::nxv8i16;
20839  else if (VT == MVT::nxv2i64)
20840    ResVT = MVT::nxv4i32;
20841  else
20842    return SDValue();
20843
20844  auto SrlOp1 =
20845      dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Srl->getOperand(1)));
20846  if (!SrlOp1)
20847    return SDValue();
20848  unsigned ShiftValue = SrlOp1->getZExtValue();
20849  if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
20850    return SDValue();
20851
20852  SDValue Add = Srl->getOperand(0);
20853  if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
20854    return SDValue();
20855  auto AddOp1 =
20856      dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
20857  if (!AddOp1)
20858    return SDValue();
20859  uint64_t AddValue = AddOp1->getZExtValue();
20860  if (AddValue != 1ULL << (ShiftValue - 1))
20861    return SDValue();
20862
20863  SDLoc DL(Srl);
20864  SDValue Rshrnb = DAG.getNode(
20865      AArch64ISD::RSHRNB_I, DL, ResVT,
20866      {Add->getOperand(0), DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
20867  return DAG.getNode(ISD::BITCAST, DL, VT, Rshrnb);
20868}
20869
20870static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
20871                                 const AArch64Subtarget *Subtarget) {
20872  SDLoc DL(N);
20873  SDValue Op0 = N->getOperand(0);
20874  SDValue Op1 = N->getOperand(1);
20875  EVT ResVT = N->getValueType(0);
20876
20877  // uzp1(x, undef) -> concat(truncate(x), undef)
20878  if (Op1.getOpcode() == ISD::UNDEF) {
20879    EVT BCVT = MVT::Other, HalfVT = MVT::Other;
20880    switch (ResVT.getSimpleVT().SimpleTy) {
20881    default:
20882      break;
20883    case MVT::v16i8:
20884      BCVT = MVT::v8i16;
20885      HalfVT = MVT::v8i8;
20886      break;
20887    case MVT::v8i16:
20888      BCVT = MVT::v4i32;
20889      HalfVT = MVT::v4i16;
20890      break;
20891    case MVT::v4i32:
20892      BCVT = MVT::v2i64;
20893      HalfVT = MVT::v2i32;
20894      break;
20895    }
20896    if (BCVT != MVT::Other) {
20897      SDValue BC = DAG.getBitcast(BCVT, Op0);
20898      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
20899      return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
20900                         DAG.getUNDEF(HalfVT));
20901    }
20902  }
20903
20904  if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op0, DAG, Subtarget))
20905    return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
20906
20907  if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op1, DAG, Subtarget))
20908    return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
20909
20910  // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
20911  if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
20912    if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
20913      SDValue X = Op0.getOperand(0).getOperand(0);
20914      return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
20915    }
20916  }
20917
20918  // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
20919  if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
20920    if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
20921      SDValue Z = Op1.getOperand(0).getOperand(1);
20922      return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
20923    }
20924  }
20925
  // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
  // This combine is only implemented for little-endian subtargets.
  bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();

  if (!IsLittleEndian)
20932    return SDValue();
20933
20934  if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
20935    return SDValue();
20936
20937  auto getSourceOp = [](SDValue Operand) -> SDValue {
20938    const unsigned Opcode = Operand.getOpcode();
20939    if (Opcode == ISD::TRUNCATE)
20940      return Operand->getOperand(0);
20941    if (Opcode == ISD::BITCAST &&
20942        Operand->getOperand(0).getOpcode() == ISD::TRUNCATE)
20943      return Operand->getOperand(0)->getOperand(0);
20944    return SDValue();
20945  };
20946
20947  SDValue SourceOp0 = getSourceOp(Op0);
20948  SDValue SourceOp1 = getSourceOp(Op1);
20949
20950  if (!SourceOp0 || !SourceOp1)
20951    return SDValue();
20952
20953  if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
20954      !SourceOp0.getValueType().isSimple())
20955    return SDValue();
20956
20957  EVT ResultTy;
20958
20959  switch (SourceOp0.getSimpleValueType().SimpleTy) {
20960  case MVT::v2i64:
20961    ResultTy = MVT::v4i32;
20962    break;
20963  case MVT::v4i32:
20964    ResultTy = MVT::v8i16;
20965    break;
20966  case MVT::v8i16:
20967    ResultTy = MVT::v16i8;
20968    break;
20969  default:
20970    return SDValue();
20971  }
20972
20973  SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
20974  SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
20975  SDValue UzpResult =
20976      DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
20977
20978  EVT BitcastResultTy;
20979
20980  switch (ResVT.getSimpleVT().SimpleTy) {
20981  case MVT::v2i32:
20982    BitcastResultTy = MVT::v2i64;
20983    break;
20984  case MVT::v4i16:
20985    BitcastResultTy = MVT::v4i32;
20986    break;
20987  case MVT::v8i8:
20988    BitcastResultTy = MVT::v8i16;
20989    break;
20990  default:
20991    llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
20992  }
20993
20994  return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
20995                     DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
20996}
20997
20998static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
20999  unsigned Opc = N->getOpcode();
21000
21001  assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
21002           Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
21003          (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
21004           Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
21005         "Invalid opcode.");
21006
21007  const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
21008                      Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
21009  const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
21010                      Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
21011  const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
21012                        Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
21013                        Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
21014                        Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
21015
21016  SDLoc DL(N);
21017  SDValue Chain = N->getOperand(0);
21018  SDValue Pg = N->getOperand(1);
21019  SDValue Base = N->getOperand(2);
21020  SDValue Offset = N->getOperand(3);
21021  SDValue Ty = N->getOperand(4);
21022
21023  EVT ResVT = N->getValueType(0);
21024
21025  const auto OffsetOpc = Offset.getOpcode();
21026  const bool OffsetIsZExt =
21027      OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
21028  const bool OffsetIsSExt =
21029      OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
21030
21031  // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
21032  if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
21033    SDValue ExtPg = Offset.getOperand(0);
21034    VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
21035    EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
21036
21037    // If the predicate for the sign- or zero-extended offset is the
21038    // same as the predicate used for this load and the sign-/zero-extension
    // was from 32 bits...
21040    if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
21041      SDValue UnextendedOffset = Offset.getOperand(1);
21042
21043      unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
21044      if (Signed)
21045        NewOpc = getSignExtendedGatherOpcode(NewOpc);
21046
21047      return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
21048                         {Chain, Pg, Base, UnextendedOffset, Ty});
21049    }
21050  }
21051
21052  return SDValue();
21053}
21054
21055/// Optimize a vector shift instruction and its operand if shifted out
21056/// bits are not used.
21057static SDValue performVectorShiftCombine(SDNode *N,
21058                                         const AArch64TargetLowering &TLI,
21059                                         TargetLowering::DAGCombinerInfo &DCI) {
21060  assert(N->getOpcode() == AArch64ISD::VASHR ||
21061         N->getOpcode() == AArch64ISD::VLSHR);
21062
21063  SDValue Op = N->getOperand(0);
21064  unsigned OpScalarSize = Op.getScalarValueSizeInBits();
21065
21066  unsigned ShiftImm = N->getConstantOperandVal(1);
21067  assert(OpScalarSize > ShiftImm && "Invalid shift imm");
21068
  // Remove sign_extend_inreg (ashr(shl(x))) based on the number of sign bits.
21070  if (N->getOpcode() == AArch64ISD::VASHR &&
21071      Op.getOpcode() == AArch64ISD::VSHL &&
21072      N->getOperand(1) == Op.getOperand(1))
21073    if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
21074      return Op.getOperand(0);
21075
21076  APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
21077  APInt DemandedMask = ~ShiftedOutBits;
21078
21079  if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
21080    return SDValue(N, 0);
21081
21082  return SDValue();
21083}
21084
21085static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
21086  // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
21087  // This transform works in partnership with performSetCCPunpkCombine to
21088  // remove unnecessary transfer of predicates into standard registers and back
21089  if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
21090      N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
21091          MVT::i1) {
21092    SDValue CC = N->getOperand(0)->getOperand(0);
21093    auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
21094    SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
21095                               DAG.getVectorIdxConstant(0, SDLoc(N)));
21096    return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
21097  }
21098
21099  return SDValue();
21100}
21101
21102/// Target-specific DAG combine function for post-increment LD1 (lane) and
21103/// post-increment LD1R.
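/// For example (illustrative), a scalar load feeding a lane insert whose
/// address is also incremented by the element size can be selected as a single
/// post-incrementing "ld1 { v0.s }[1], [x0], #4", and the dup form similarly
/// becomes "ld1r { v0.4s }, [x0], #4".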
21104static SDValue performPostLD1Combine(SDNode *N,
21105                                     TargetLowering::DAGCombinerInfo &DCI,
21106                                     bool IsLaneOp) {
21107  if (DCI.isBeforeLegalizeOps())
21108    return SDValue();
21109
21110  SelectionDAG &DAG = DCI.DAG;
21111  EVT VT = N->getValueType(0);
21112
21113  if (!VT.is128BitVector() && !VT.is64BitVector())
21114    return SDValue();
21115
21116  unsigned LoadIdx = IsLaneOp ? 1 : 0;
21117  SDNode *LD = N->getOperand(LoadIdx).getNode();
  // If it is not a LOAD, we cannot do this combine.
21119  if (LD->getOpcode() != ISD::LOAD)
21120    return SDValue();
21121
21122  // The vector lane must be a constant in the LD1LANE opcode.
21123  SDValue Lane;
21124  if (IsLaneOp) {
21125    Lane = N->getOperand(2);
21126    auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
21127    if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
21128      return SDValue();
21129  }
21130
21131  LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
21132  EVT MemVT = LoadSDN->getMemoryVT();
21133  // Check if memory operand is the same type as the vector element.
21134  if (MemVT != VT.getVectorElementType())
21135    return SDValue();
21136
21137  // Check if there are other uses. If so, do not combine as it will introduce
21138  // an extra load.
21139  for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
21140       ++UI) {
21141    if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
21142      continue;
21143    if (*UI != N)
21144      return SDValue();
21145  }
21146
21147  // If there is one use and it can splat the value, prefer that operation.
21148  // TODO: This could be expanded to more operations if they reliably use the
21149  // index variants.
21150  if (N->hasOneUse()) {
21151    unsigned UseOpc = N->use_begin()->getOpcode();
21152    if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
21153      return SDValue();
21154  }
21155
21156  SDValue Addr = LD->getOperand(1);
21157  SDValue Vector = N->getOperand(0);
21158  // Search for a use of the address operand that is an increment.
21159  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
21160       Addr.getNode()->use_end(); UI != UE; ++UI) {
21161    SDNode *User = *UI;
21162    if (User->getOpcode() != ISD::ADD
21163        || UI.getUse().getResNo() != Addr.getResNo())
21164      continue;
21165
21166    // If the increment is a constant, it must match the memory ref size.
21167    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
21168    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
21169      uint32_t IncVal = CInc->getZExtValue();
21170      unsigned NumBytes = VT.getScalarSizeInBits() / 8;
21171      if (IncVal != NumBytes)
21172        continue;
21173      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
21174    }
21175
    // To avoid constructing a cycle, make sure that neither the load nor the
    // add is a predecessor of the other or of the Vector.
21178    SmallPtrSet<const SDNode *, 32> Visited;
21179    SmallVector<const SDNode *, 16> Worklist;
21180    Visited.insert(Addr.getNode());
21181    Worklist.push_back(User);
21182    Worklist.push_back(LD);
21183    Worklist.push_back(Vector.getNode());
21184    if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
21185        SDNode::hasPredecessorHelper(User, Visited, Worklist))
21186      continue;
21187
21188    SmallVector<SDValue, 8> Ops;
21189    Ops.push_back(LD->getOperand(0));  // Chain
21190    if (IsLaneOp) {
21191      Ops.push_back(Vector);           // The vector to be inserted
21192      Ops.push_back(Lane);             // The lane to be inserted in the vector
21193    }
21194    Ops.push_back(Addr);
21195    Ops.push_back(Inc);
21196
21197    EVT Tys[3] = { VT, MVT::i64, MVT::Other };
21198    SDVTList SDTys = DAG.getVTList(Tys);
21199    unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
21200    SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
21201                                           MemVT,
21202                                           LoadSDN->getMemOperand());
21203
21204    // Update the uses.
21205    SDValue NewResults[] = {
21206        SDValue(LD, 0),            // The result of load
21207        SDValue(UpdN.getNode(), 2) // Chain
21208    };
21209    DCI.CombineTo(LD, NewResults);
21210    DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));     // Dup/Inserted Result
21211    DCI.CombineTo(User, SDValue(UpdN.getNode(), 1));  // Write back register
21212
21213    break;
21214  }
21215  return SDValue();
21216}
21217
21218/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
21219/// address translation.
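/// With TBI, only bits [55:0] of the address take part in translation, so any
/// computation that only affects the top byte (e.g. a pointer tag) can be
/// dropped from the address operand.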
21220static bool performTBISimplification(SDValue Addr,
21221                                     TargetLowering::DAGCombinerInfo &DCI,
21222                                     SelectionDAG &DAG) {
21223  APInt DemandedMask = APInt::getLowBitsSet(64, 56);
21224  KnownBits Known;
21225  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
21226                                        !DCI.isBeforeLegalizeOps());
21227  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21228  if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
21229    DCI.CommitTargetLoweringOpt(TLO);
21230    return true;
21231  }
21232  return false;
21233}
21234
21235static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
21236  assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
21237         "Expected STORE dag node in input!");
21238
21239  if (auto Store = dyn_cast<StoreSDNode>(N)) {
21240    if (!Store->isTruncatingStore() || Store->isIndexed())
21241      return SDValue();
21242    SDValue Ext = Store->getValue();
21243    auto ExtOpCode = Ext.getOpcode();
21244    if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
21245        ExtOpCode != ISD::ANY_EXTEND)
21246      return SDValue();
21247    SDValue Orig = Ext->getOperand(0);
21248    if (Store->getMemoryVT() != Orig.getValueType())
21249      return SDValue();
21250    return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
21251                        Store->getBasePtr(), Store->getMemOperand());
21252  }
21253
21254  return SDValue();
21255}
21256
// Perform TBI simplification if supported by the target and try to break up
// non-temporal loads larger than 256 bits for odd types so that 256-bit LDNP
// (Q-register pair) load instructions can be selected.
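// For example (illustrative), a non-temporal <20 x i16> (320-bit) load can be
// emitted as one 256-bit load plus a 64-bit remainder load instead of several
// irregular pieces.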
21260static SDValue performLOADCombine(SDNode *N,
21261                                  TargetLowering::DAGCombinerInfo &DCI,
21262                                  SelectionDAG &DAG,
21263                                  const AArch64Subtarget *Subtarget) {
21264  if (Subtarget->supportsAddressTopByteIgnored())
21265    performTBISimplification(N->getOperand(1), DCI, DAG);
21266
21267  LoadSDNode *LD = cast<LoadSDNode>(N);
21268  EVT MemVT = LD->getMemoryVT();
21269  if (LD->isVolatile() || !LD->isNonTemporal() || !Subtarget->isLittleEndian())
21270    return SDValue(N, 0);
21271
21272  if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
21273      MemVT.getSizeInBits() % 256 == 0 ||
21274      256 % MemVT.getScalarSizeInBits() != 0)
21275    return SDValue(N, 0);
21276
21277  SDLoc DL(LD);
21278  SDValue Chain = LD->getChain();
21279  SDValue BasePtr = LD->getBasePtr();
21280  SDNodeFlags Flags = LD->getFlags();
21281  SmallVector<SDValue, 4> LoadOps;
21282  SmallVector<SDValue, 4> LoadOpsChain;
  // Replace any non-temporal load over 256 bits with a series of 256-bit loads
  // and a scalar/vector load of less than 256 bits. This way we can utilize
  // 256-bit loads and reduce the number of load instructions generated.
21286  MVT NewVT =
21287      MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
21288                       256 / MemVT.getVectorElementType().getSizeInBits());
21289  unsigned Num256Loads = MemVT.getSizeInBits() / 256;
  // Create all 256-bit loads starting from offset 0 and up to
  // (Num256Loads - 1) * 32.
21291  for (unsigned I = 0; I < Num256Loads; I++) {
21292    unsigned PtrOffset = I * 32;
21293    SDValue NewPtr = DAG.getMemBasePlusOffset(
21294        BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
21295    Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
21296    SDValue NewLoad = DAG.getLoad(
21297        NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
21298        NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
21299    LoadOps.push_back(NewLoad);
21300    LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
21301  }
21302
21303  // Process remaining bits of the load operation.
21304  // This is done by creating an UNDEF vector to match the size of the
21305  // 256-bit loads and inserting the remaining load to it. We extract the
21306  // original load type at the end using EXTRACT_SUBVECTOR instruction.
21307  unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
21308  unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
21309  MVT RemainingVT = MVT::getVectorVT(
21310      MemVT.getVectorElementType().getSimpleVT(),
21311      BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
21312  SDValue NewPtr = DAG.getMemBasePlusOffset(
21313      BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
21314  Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
21315  SDValue RemainingLoad =
21316      DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
21317                  LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
21318                  LD->getMemOperand()->getFlags(), LD->getAAInfo());
21319  SDValue UndefVector = DAG.getUNDEF(NewVT);
21320  SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
  SDValue ExtendedRemainingLoad =
      DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
                  {UndefVector, RemainingLoad, InsertIdx});
  LoadOps.push_back(ExtendedRemainingLoad);
21325  LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
21326  EVT ConcatVT =
21327      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
21328                       LoadOps.size() * NewVT.getVectorNumElements());
21329  SDValue ConcatVectors =
21330      DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
21331  // Extract the original vector type size.
21332  SDValue ExtractSubVector =
21333      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
21334                  {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
21335  SDValue TokenFactor =
21336      DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
21337  return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
21338}
21339
21340static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
21341  EVT VecVT = Op.getValueType();
21342  assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
21343         "Need boolean vector type.");
21344
21345  if (Depth > 3)
21346    return MVT::INVALID_SIMPLE_VALUE_TYPE;
21347
21348  // We can get the base type from a vector compare or truncate.
21349  if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
21350    return Op.getOperand(0).getValueType();
21351
21352  // If an operand is a bool vector, continue looking.
21353  EVT BaseVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
21354  for (SDValue Operand : Op->op_values()) {
21355    if (Operand.getValueType() != VecVT)
21356      continue;
21357
21358    EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
21359    if (!BaseVT.isSimple())
21360      BaseVT = OperandVT;
21361    else if (OperandVT != BaseVT)
21362      return MVT::INVALID_SIMPLE_VALUE_TYPE;
21363  }
21364
21365  return BaseVT;
21366}
21367
21368// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
21369// iN, we can use a trick that extracts the i^th bit from the i^th element and
21370// then performs a vector add to get a scalar bitmask. This requires that each
21371// element's bits are either all 1 or all 0.
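// For example (illustrative), a v4i32 comparison result (each element all-ones
// or all-zeros) is ANDed with <1, 2, 4, 8> and reduced with VECREDUCE_ADD,
// yielding the 4-bit mask in a scalar.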
21372static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
21373  SDLoc DL(N);
21374  SDValue ComparisonResult(N, 0);
21375  EVT VecVT = ComparisonResult.getValueType();
21376  assert(VecVT.isVector() && "Must be a vector type");
21377
21378  unsigned NumElts = VecVT.getVectorNumElements();
21379  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
21380    return SDValue();
21381
21382  if (VecVT.getVectorElementType() != MVT::i1 &&
21383      !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
21384    return SDValue();
21385
21386  // If we can find the original types to work on instead of a vector of i1,
21387  // we can avoid extend/extract conversion instructions.
21388  if (VecVT.getVectorElementType() == MVT::i1) {
21389    VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
21390    if (!VecVT.isSimple()) {
21391      unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
21392      VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
21393    }
21394  }
21395  VecVT = VecVT.changeVectorElementTypeToInteger();
21396
21397  // Large vectors don't map directly to this conversion, so to avoid too many
21398  // edge cases, we don't apply it here. The conversion will likely still be
21399  // applied later via multiple smaller vectors, whose results are concatenated.
21400  if (VecVT.getSizeInBits() > 128)
21401    return SDValue();
21402
21403  // Ensure that all elements' bits are either 0s or 1s.
21404  ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
21405
21406  SmallVector<SDValue, 16> MaskConstants;
21407  if (VecVT == MVT::v16i8) {
21408    // v16i8 is a special case, as we have 16 entries but only 8 positional bits
21409    // per entry. We split it into two halves, apply the mask, zip the halves to
    // create 8x 16-bit values, and then perform the vector reduce.
21411    for (unsigned Half = 0; Half < 2; ++Half) {
21412      for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
21413        MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
21414      }
21415    }
21416    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
21417    SDValue RepresentativeBits =
21418        DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
21419
21420    SDValue UpperRepresentativeBits =
21421        DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
21422                    RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
21423    SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
21424                                 RepresentativeBits, UpperRepresentativeBits);
21425    Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
21426    return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
21427  }
21428
21429  // All other vector sizes.
21430  unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
21431  for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
21432    MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
21433  }
21434
21435  SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
21436  SDValue RepresentativeBits =
21437      DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
21438  EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
21439      NumElts, VecVT.getVectorElementType().getSizeInBits()));
21440  return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
21441}
21442
21443static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
21444                                                 StoreSDNode *Store) {
21445  if (!Store->isTruncatingStore())
21446    return SDValue();
21447
21448  SDLoc DL(Store);
21449  SDValue VecOp = Store->getValue();
21450  EVT VT = VecOp.getValueType();
21451  EVT MemVT = Store->getMemoryVT();
21452
21453  if (!MemVT.isVector() || !VT.isVector() ||
21454      MemVT.getVectorElementType() != MVT::i1)
21455    return SDValue();
21456
21457  // If we are storing a vector that we are currently building, let
21458  // `scalarizeVectorStore()` handle this more efficiently.
21459  if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
21460    return SDValue();
21461
21462  VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
21463  SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
21464  if (!VectorBits)
21465    return SDValue();
21466
21467  EVT StoreVT =
21468      EVT::getIntegerVT(*DAG.getContext(), MemVT.getStoreSizeInBits());
21469  SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
21470  return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
21471                      Store->getMemOperand());
21472}
21473
static bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
21475  return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
21476         (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
21477         (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
21478}
21479
21480static SDValue performSTORECombine(SDNode *N,
21481                                   TargetLowering::DAGCombinerInfo &DCI,
21482                                   SelectionDAG &DAG,
21483                                   const AArch64Subtarget *Subtarget) {
21484  StoreSDNode *ST = cast<StoreSDNode>(N);
21485  SDValue Chain = ST->getChain();
21486  SDValue Value = ST->getValue();
21487  SDValue Ptr = ST->getBasePtr();
21488  EVT ValueVT = Value.getValueType();
21489
21490  auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
21491    EVT EltVT = VT.getVectorElementType();
21492    return EltVT == MVT::f32 || EltVT == MVT::f64;
21493  };
21494
21495  // If this is an FP_ROUND followed by a store, fold this into a truncating
21496  // store. We can do this even if this is already a truncstore.
21497  // We purposefully don't care about legality of the nodes here as we know
21498  // they can be split down into something legal.
21499  if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
21500      Value.getNode()->hasOneUse() && ST->isUnindexed() &&
21501      Subtarget->useSVEForFixedLengthVectors() &&
21502      ValueVT.isFixedLengthVector() &&
21503      ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
21504      hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
21505    return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
21506                             ST->getMemoryVT(), ST->getMemOperand());
21507
21508  if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
21509    return Split;
21510
21511  if (Subtarget->supportsAddressTopByteIgnored() &&
21512      performTBISimplification(N->getOperand(2), DCI, DAG))
21513    return SDValue(N, 0);
21514
21515  if (SDValue Store = foldTruncStoreOfExt(DAG, N))
21516    return Store;
21517
21518  if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
21519    return Store;
21520
21521  if (ST->isTruncatingStore()) {
21522    EVT StoreVT = ST->getMemoryVT();
21523    if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT))
21524      return SDValue();
21525    if (SDValue Rshrnb =
21526            trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
21527      return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
21528                               StoreVT, ST->getMemOperand());
21529    }
21530  }
21531
21532  return SDValue();
21533}
21534
21535static SDValue performMSTORECombine(SDNode *N,
21536                                    TargetLowering::DAGCombinerInfo &DCI,
21537                                    SelectionDAG &DAG,
21538                                    const AArch64Subtarget *Subtarget) {
21539  MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
21540  SDValue Value = MST->getValue();
21541  SDValue Mask = MST->getMask();
21542  SDLoc DL(N);
21543
21544  // If this is a UZP1 followed by a masked store, fold this into a masked
21545  // truncating store.  We can do this even if this is already a masked
21546  // truncstore.
21547  if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
21548      MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
21549      Value.getValueType().isInteger()) {
21550    Value = Value.getOperand(0);
21551    if (Value.getOpcode() == ISD::BITCAST) {
21552      EVT HalfVT =
21553          Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
21554      EVT InVT = Value.getOperand(0).getValueType();
21555
21556      if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
21557        unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
21558        unsigned PgPattern = Mask->getConstantOperandVal(0);
21559
21560        // Ensure we can double the size of the predicate pattern
21561        unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
21562        if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
21563                           MinSVESize) {
21564          Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
21565                          PgPattern);
21566          return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
21567                                    MST->getBasePtr(), MST->getOffset(), Mask,
21568                                    MST->getMemoryVT(), MST->getMemOperand(),
21569                                    MST->getAddressingMode(),
21570                                    /*IsTruncating=*/true);
21571        }
21572      }
21573    }
21574  }
21575
21576  if (MST->isTruncatingStore()) {
21577    EVT ValueVT = Value->getValueType(0);
21578    EVT MemVT = MST->getMemoryVT();
21579    if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
21580      return SDValue();
21581    if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
21582      return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
21583                                MST->getOffset(), MST->getMask(),
21584                                MST->getMemoryVT(), MST->getMemOperand(),
21585                                MST->getAddressingMode(), true);
21586    }
21587  }
21588
21589  return SDValue();
21590}
21591
21592/// \return true if part of the index was folded into the Base.
21593static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
21594                              SDLoc DL, SelectionDAG &DAG) {
21595  // This function assumes a vector of i64 indices.
21596  EVT IndexVT = Index.getValueType();
21597  if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
21598    return false;
21599
21600  // Simplify:
21601  //   BasePtr = Ptr
21602  //   Index = X + splat(Offset)
21603  // ->
21604  //   BasePtr = Ptr + Offset * scale.
21605  //   Index = X
21606  if (Index.getOpcode() == ISD::ADD) {
21607    if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
21608      Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
21609      BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
21610      Index = Index.getOperand(0);
21611      return true;
21612    }
21613  }
21614
21615  // Simplify:
21616  //   BasePtr = Ptr
21617  //   Index = (X + splat(Offset)) << splat(Shift)
21618  // ->
  //   BasePtr = Ptr + (Offset << Shift) * Scale
  //   Index = X << splat(Shift)
21621  if (Index.getOpcode() == ISD::SHL &&
21622      Index.getOperand(0).getOpcode() == ISD::ADD) {
21623    SDValue Add = Index.getOperand(0);
21624    SDValue ShiftOp = Index.getOperand(1);
21625    SDValue OffsetOp = Add.getOperand(1);
21626    if (auto Shift = DAG.getSplatValue(ShiftOp))
21627      if (auto Offset = DAG.getSplatValue(OffsetOp)) {
21628        Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
21629        Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
21630        BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
21631        Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
21632                            Add.getOperand(0), ShiftOp);
21633        return true;
21634      }
21635  }
21636
21637  return false;
21638}
21639
21640// Analyse the specified address returning true if a more optimal addressing
21641// mode is available. When returning true all parameters are updated to reflect
21642// their recommended values.
21643static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
21644                                     SDValue &BasePtr, SDValue &Index,
21645                                     SelectionDAG &DAG) {
21646  // Try to iteratively fold parts of the index into the base pointer to
21647  // simplify the index as much as possible.
21648  bool Changed = false;
21649  while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
21650    Changed = true;
21651
21652  // Only consider element types that are pointer sized as smaller types can
21653  // be easily promoted.
21654  EVT IndexVT = Index.getValueType();
21655  if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
21656    return Changed;
21657
21658  // Can indices be trivially shrunk?
21659  EVT DataVT = N->getOperand(1).getValueType();
  // Don't attempt to shrink the index for fixed vectors of 64-bit data since
  // it will later be re-extended to 64 bits during legalization.
21662  if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
21663    return Changed;
21664  if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
21665    EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
21666    Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
21667    return true;
21668  }
21669
21670  // Match:
21671  //   Index = step(const)
21672  int64_t Stride = 0;
21673  if (Index.getOpcode() == ISD::STEP_VECTOR) {
21674    Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
21675  }
21676  // Match:
21677  //   Index = step(const) << shift(const)
21678  else if (Index.getOpcode() == ISD::SHL &&
21679           Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
21680    SDValue RHS = Index.getOperand(1);
21681    if (auto *Shift =
21682            dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
21683      int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
21684      Stride = Step << Shift->getZExtValue();
21685    }
21686  }
21687
21688  // Return early because no supported pattern is found.
21689  if (Stride == 0)
21690    return Changed;
21691
21692  if (Stride < std::numeric_limits<int32_t>::min() ||
21693      Stride > std::numeric_limits<int32_t>::max())
21694    return Changed;
21695
21696  const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
21697  unsigned MaxVScale =
21698      Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
21699  int64_t LastElementOffset =
21700      IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
21701
21702  if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
21703      LastElementOffset > std::numeric_limits<int32_t>::max())
21704    return Changed;
21705
21706  EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
  // Stride is not scaled explicitly by 'Scale' here, because that scaling
  // happens as part of the gather/scatter addressing mode.
21709  Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride));
21710  return true;
21711}
21712
21713static SDValue performMaskedGatherScatterCombine(
21714    SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
21715  MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
21716  assert(MGS && "Can only combine gather load or scatter store nodes");
21717
21718  if (!DCI.isBeforeLegalize())
21719    return SDValue();
21720
21721  SDLoc DL(MGS);
21722  SDValue Chain = MGS->getChain();
21723  SDValue Scale = MGS->getScale();
21724  SDValue Index = MGS->getIndex();
21725  SDValue Mask = MGS->getMask();
21726  SDValue BasePtr = MGS->getBasePtr();
21727  ISD::MemIndexType IndexType = MGS->getIndexType();
21728
21729  if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
21730    return SDValue();
21731
  // A more optimal addressing mode was found above; rebuild the gather/scatter
  // node so that it uses the updated BasePtr and Index, which are more
  // legalisation friendly.
21734  if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
21735    SDValue PassThru = MGT->getPassThru();
21736    SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
21737    return DAG.getMaskedGather(
21738        DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
21739        Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
21740  }
21741  auto *MSC = cast<MaskedScatterSDNode>(MGS);
21742  SDValue Data = MSC->getValue();
21743  SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
21744  return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
21745                              Ops, MSC->getMemOperand(), IndexType,
21746                              MSC->isTruncatingStore());
21747}
21748
21749/// Target-specific DAG combine function for NEON load/store intrinsics
21750/// to merge base address updates.
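/// For example (illustrative), an @llvm.aarch64.neon.ld2 whose address is also
/// advanced by the size of the loaded data (32 bytes for two 128-bit vectors)
/// can be selected as a post-indexed "ld2 { v0.4s, v1.4s }, [x0], #32".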
21751static SDValue performNEONPostLDSTCombine(SDNode *N,
21752                                          TargetLowering::DAGCombinerInfo &DCI,
21753                                          SelectionDAG &DAG) {
21754  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
21755    return SDValue();
21756
21757  unsigned AddrOpIdx = N->getNumOperands() - 1;
21758  SDValue Addr = N->getOperand(AddrOpIdx);
21759
21760  // Search for a use of the address operand that is an increment.
21761  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
21762       UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
21763    SDNode *User = *UI;
21764    if (User->getOpcode() != ISD::ADD ||
21765        UI.getUse().getResNo() != Addr.getResNo())
21766      continue;
21767
21768    // Check that the add is independent of the load/store.  Otherwise, folding
21769    // it would create a cycle.
21770    SmallPtrSet<const SDNode *, 32> Visited;
21771    SmallVector<const SDNode *, 16> Worklist;
21772    Visited.insert(Addr.getNode());
21773    Worklist.push_back(N);
21774    Worklist.push_back(User);
21775    if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
21776        SDNode::hasPredecessorHelper(User, Visited, Worklist))
21777      continue;
21778
21779    // Find the new opcode for the updating load/store.
21780    bool IsStore = false;
21781    bool IsLaneOp = false;
21782    bool IsDupOp = false;
21783    unsigned NewOpc = 0;
21784    unsigned NumVecs = 0;
21785    unsigned IntNo = N->getConstantOperandVal(1);
21786    switch (IntNo) {
21787    default: llvm_unreachable("unexpected intrinsic for Neon base update");
21788    case Intrinsic::aarch64_neon_ld2:       NewOpc = AArch64ISD::LD2post;
21789      NumVecs = 2; break;
21790    case Intrinsic::aarch64_neon_ld3:       NewOpc = AArch64ISD::LD3post;
21791      NumVecs = 3; break;
21792    case Intrinsic::aarch64_neon_ld4:       NewOpc = AArch64ISD::LD4post;
21793      NumVecs = 4; break;
21794    case Intrinsic::aarch64_neon_st2:       NewOpc = AArch64ISD::ST2post;
21795      NumVecs = 2; IsStore = true; break;
21796    case Intrinsic::aarch64_neon_st3:       NewOpc = AArch64ISD::ST3post;
21797      NumVecs = 3; IsStore = true; break;
21798    case Intrinsic::aarch64_neon_st4:       NewOpc = AArch64ISD::ST4post;
21799      NumVecs = 4; IsStore = true; break;
21800    case Intrinsic::aarch64_neon_ld1x2:     NewOpc = AArch64ISD::LD1x2post;
21801      NumVecs = 2; break;
21802    case Intrinsic::aarch64_neon_ld1x3:     NewOpc = AArch64ISD::LD1x3post;
21803      NumVecs = 3; break;
21804    case Intrinsic::aarch64_neon_ld1x4:     NewOpc = AArch64ISD::LD1x4post;
21805      NumVecs = 4; break;
21806    case Intrinsic::aarch64_neon_st1x2:     NewOpc = AArch64ISD::ST1x2post;
21807      NumVecs = 2; IsStore = true; break;
21808    case Intrinsic::aarch64_neon_st1x3:     NewOpc = AArch64ISD::ST1x3post;
21809      NumVecs = 3; IsStore = true; break;
21810    case Intrinsic::aarch64_neon_st1x4:     NewOpc = AArch64ISD::ST1x4post;
21811      NumVecs = 4; IsStore = true; break;
21812    case Intrinsic::aarch64_neon_ld2r:      NewOpc = AArch64ISD::LD2DUPpost;
21813      NumVecs = 2; IsDupOp = true; break;
21814    case Intrinsic::aarch64_neon_ld3r:      NewOpc = AArch64ISD::LD3DUPpost;
21815      NumVecs = 3; IsDupOp = true; break;
21816    case Intrinsic::aarch64_neon_ld4r:      NewOpc = AArch64ISD::LD4DUPpost;
21817      NumVecs = 4; IsDupOp = true; break;
21818    case Intrinsic::aarch64_neon_ld2lane:   NewOpc = AArch64ISD::LD2LANEpost;
21819      NumVecs = 2; IsLaneOp = true; break;
21820    case Intrinsic::aarch64_neon_ld3lane:   NewOpc = AArch64ISD::LD3LANEpost;
21821      NumVecs = 3; IsLaneOp = true; break;
21822    case Intrinsic::aarch64_neon_ld4lane:   NewOpc = AArch64ISD::LD4LANEpost;
21823      NumVecs = 4; IsLaneOp = true; break;
21824    case Intrinsic::aarch64_neon_st2lane:   NewOpc = AArch64ISD::ST2LANEpost;
21825      NumVecs = 2; IsStore = true; IsLaneOp = true; break;
21826    case Intrinsic::aarch64_neon_st3lane:   NewOpc = AArch64ISD::ST3LANEpost;
21827      NumVecs = 3; IsStore = true; IsLaneOp = true; break;
21828    case Intrinsic::aarch64_neon_st4lane:   NewOpc = AArch64ISD::ST4LANEpost;
21829      NumVecs = 4; IsStore = true; IsLaneOp = true; break;
21830    }
21831
21832    EVT VecTy;
21833    if (IsStore)
21834      VecTy = N->getOperand(2).getValueType();
21835    else
21836      VecTy = N->getValueType(0);
21837
21838    // If the increment is a constant, it must match the memory ref size.
21839    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
21840    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
21841      uint32_t IncVal = CInc->getZExtValue();
21842      unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
21843      if (IsLaneOp || IsDupOp)
21844        NumBytes /= VecTy.getVectorNumElements();
21845      if (IncVal != NumBytes)
21846        continue;
21847      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
21848    }
21849    SmallVector<SDValue, 8> Ops;
21850    Ops.push_back(N->getOperand(0)); // Incoming chain
21851    // Load lane and store have vector list as input.
21852    if (IsLaneOp || IsStore)
21853      for (unsigned i = 2; i < AddrOpIdx; ++i)
21854        Ops.push_back(N->getOperand(i));
21855    Ops.push_back(Addr); // Base register
21856    Ops.push_back(Inc);
21857
21858    // Return Types.
21859    EVT Tys[6];
21860    unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
21861    unsigned n;
21862    for (n = 0; n < NumResultVecs; ++n)
21863      Tys[n] = VecTy;
21864    Tys[n++] = MVT::i64;  // Type of write back register
21865    Tys[n] = MVT::Other;  // Type of the chain
21866    SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
21867
21868    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
21869    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
21870                                           MemInt->getMemoryVT(),
21871                                           MemInt->getMemOperand());
21872
21873    // Update the uses.
21874    std::vector<SDValue> NewResults;
21875    for (unsigned i = 0; i < NumResultVecs; ++i) {
21876      NewResults.push_back(SDValue(UpdN.getNode(), i));
21877    }
21878    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
21879    DCI.CombineTo(N, NewResults);
21880    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
21881
21882    break;
21883  }
21884  return SDValue();
21885}
21886
21887// Checks to see if the value is the prescribed width and returns information
21888// about its extension mode.
21889static
21890bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
21891  ExtType = ISD::NON_EXTLOAD;
21892  switch(V.getNode()->getOpcode()) {
21893  default:
21894    return false;
21895  case ISD::LOAD: {
21896    LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
21897    if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
21898       || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
21899      ExtType = LoadNode->getExtensionType();
21900      return true;
21901    }
21902    return false;
21903  }
21904  case ISD::AssertSext: {
21905    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
21906    if ((TypeNode->getVT() == MVT::i8 && width == 8)
21907       || (TypeNode->getVT() == MVT::i16 && width == 16)) {
21908      ExtType = ISD::SEXTLOAD;
21909      return true;
21910    }
21911    return false;
21912  }
21913  case ISD::AssertZext: {
21914    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
21915    if ((TypeNode->getVT() == MVT::i8 && width == 8)
21916       || (TypeNode->getVT() == MVT::i16 && width == 16)) {
21917      ExtType = ISD::ZEXTLOAD;
21918      return true;
21919    }
21920    return false;
21921  }
21922  case ISD::Constant:
21923  case ISD::TargetConstant: {
21924    return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
21925           1LL << (width - 1);
21926  }
21927  }
21928
21929  return true;
21930}
21931
21932// This function does a whole lot of voodoo to determine if the tests are
21933// equivalent without and with a mask. Essentially what happens is that given a
21934// DAG resembling:
21935//
21936//  +-------------+ +-------------+ +-------------+ +-------------+
21937//  |    Input    | | AddConstant | | CompConstant| |     CC      |
21938//  +-------------+ +-------------+ +-------------+ +-------------+
21939//           |           |           |               |
21940//           V           V           |    +----------+
21941//          +-------------+  +----+  |    |
21942//          |     ADD     |  |0xff|  |    |
21943//          +-------------+  +----+  |    |
21944//                  |           |    |    |
21945//                  V           V    |    |
21946//                 +-------------+   |    |
21947//                 |     AND     |   |    |
21948//                 +-------------+   |    |
21949//                      |            |    |
21950//                      +-----+      |    |
21951//                            |      |    |
21952//                            V      V    V
21953//                           +-------------+
21954//                           |     CMP     |
21955//                           +-------------+
21956//
21957// The AND node may be safely removed for some combinations of inputs. In
21958// particular we need to take into account the extension type of the Input,
21959// the exact values of AddConstant, CompConstant, and CC, along with the nominal
// width of the input (this can work for any width of input; the above graph is
// specific to 8 bits).
21962//
21963// The specific equations were worked out by generating output tables for each
// AArch64CC value in terms of the AddConstant (w1) and CompConstant (w2). The
21965// problem was simplified by working with 4 bit inputs, which means we only
21966// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
21967// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
21968// patterns present in both extensions (0,7). For every distinct set of
21969// AddConstant and CompConstants bit patterns we can consider the masked and
21970// unmasked versions to be equivalent if the result of this function is true for
21971// all 16 distinct bit patterns of for the current extension type of Input (w0).
21972//
21973//   sub      w8, w0, w1
21974//   and      w10, w8, #0x0f
21975//   cmp      w8, w2
21976//   cset     w9, AArch64CC
21977//   cmp      w10, w2
21978//   cset     w11, AArch64CC
21979//   cmp      w9, w11
21980//   cset     w0, eq
21981//   ret
21982//
// Since the above function shows when the outputs are equivalent it defines
// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
// would be expensive to run during compiles. The equations below were written
// in a test harness that confirmed they give outputs equivalent to the above
// function for all inputs, so they can be used to determine whether the removal
// is legal instead.
21989//
// isEquivalentMaskless() is the test for whether the AND can be removed,
// factored out of the DAG recognition because the DAG can take several forms.
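//
// For example, with an 8-bit zero-extended Input x (so x is in [0, 255]),
// AddConstant = -1, CompConstant = 10 and CC = LO, the masked compare
//   ((x + -1) & 0xff) <u 10
// and the unmasked compare
//   (x + -1) <u 10
// agree for every x, so isEquivalentMaskless() returns true and the AND may
// be dropped for that combination.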
21992
21993static bool isEquivalentMaskless(unsigned CC, unsigned width,
21994                                 ISD::LoadExtType ExtType, int AddConstant,
21995                                 int CompConstant) {
  // By being careful about our equations and only writing them in terms of
  // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
  // make them generally applicable to all bit widths.
21999  int MaxUInt = (1 << width);
22000
22001  // For the purposes of these comparisons sign extending the type is
22002  // equivalent to zero extending the add and displacing it by half the integer
22003  // width. Provided we are careful and make sure our equations are valid over
22004  // the whole range we can just adjust the input and avoid writing equations
22005  // for sign extended inputs.
22006  if (ExtType == ISD::SEXTLOAD)
22007    AddConstant -= (1 << (width-1));
22008
22009  switch(CC) {
22010  case AArch64CC::LE:
22011  case AArch64CC::GT:
22012    if ((AddConstant == 0) ||
22013        (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
22014        (AddConstant >= 0 && CompConstant < 0) ||
22015        (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
22016      return true;
22017    break;
22018  case AArch64CC::LT:
22019  case AArch64CC::GE:
22020    if ((AddConstant == 0) ||
22021        (AddConstant >= 0 && CompConstant <= 0) ||
22022        (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
22023      return true;
22024    break;
22025  case AArch64CC::HI:
22026  case AArch64CC::LS:
    if ((AddConstant >= 0 && CompConstant < 0) ||
        (AddConstant <= 0 && CompConstant >= -1 &&
         CompConstant < AddConstant + MaxUInt))
      return true;
    break;
22032  case AArch64CC::PL:
22033  case AArch64CC::MI:
22034    if ((AddConstant == 0) ||
22035        (AddConstant > 0 && CompConstant <= 0) ||
22036        (AddConstant < 0 && CompConstant <= AddConstant))
22037      return true;
22038    break;
22039  case AArch64CC::LO:
22040  case AArch64CC::HS:
22041    if ((AddConstant >= 0 && CompConstant <= 0) ||
22042        (AddConstant <= 0 && CompConstant >= 0 &&
22043         CompConstant <= AddConstant + MaxUInt))
22044      return true;
22045    break;
22046  case AArch64CC::EQ:
22047  case AArch64CC::NE:
22048    if ((AddConstant > 0 && CompConstant < 0) ||
22049        (AddConstant < 0 && CompConstant >= 0 &&
22050         CompConstant < AddConstant + MaxUInt) ||
22051        (AddConstant >= 0 && CompConstant >= 0 &&
22052         CompConstant >= AddConstant) ||
22053        (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
22054      return true;
22055    break;
22056  case AArch64CC::VS:
22057  case AArch64CC::VC:
22058  case AArch64CC::AL:
22059  case AArch64CC::NV:
22060    return true;
22061  case AArch64CC::Invalid:
22062    break;
22063  }
22064
22065  return false;
22066}
22067
// (X & C) >u Mask --> ((X & (C & ~Mask)) != 0)
// (X & C) <u Pow2 --> ((X & (C & ~(Pow2 - 1))) == 0)
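//
// For example, with C = 0xF0 and Mask = 0x0F:
//   (X & 0xF0) >u 0x0F  -->  ((X & 0xF0) != 0)
// and with C = 0xFF and Pow2 = 16:
//   (X & 0xFF) <u 16    -->  ((X & 0xF0) == 0)
// which allows the SUBS to be replaced by an ANDS whose flags feed an NE/EQ
// condition instead.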
22070static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
22071                                        SDNode *AndNode, SelectionDAG &DAG,
22072                                        unsigned CCIndex, unsigned CmpIndex,
22073                                        unsigned CC) {
22074  ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
22075  if (!SubsC)
22076    return SDValue();
22077
22078  APInt SubsAP = SubsC->getAPIntValue();
22079  if (CC == AArch64CC::HI) {
22080    if (!SubsAP.isMask())
22081      return SDValue();
22082  } else if (CC == AArch64CC::LO) {
22083    if (!SubsAP.isPowerOf2())
22084      return SDValue();
22085  } else
22086    return SDValue();
22087
22088  ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
22089  if (!AndC)
22090    return SDValue();
22091
22092  APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
22093
22094  SDLoc DL(N);
22095  APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
22096  SDValue ANDS = DAG.getNode(
22097      AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
22098      DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
22099  SDValue AArch64_CC =
22100      DAG.getConstant(CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
22101                      N->getOperand(CCIndex)->getValueType(0));
22102
  // For now, only performCSELCombine and performBRCONDCombine call this
  // function. Both of them pass 2 for CCIndex and 3 for CmpIndex, with 4
  // operands, so just initialize the operands directly to keep the code simple.
  // If another caller appears with different CCIndex/CmpIndex values, this will
  // need to be rewritten to build the operand list with a loop.
  // TODO: Should we also assert that the number of operands is 4 here?
22109  assert((CCIndex == 2 && CmpIndex == 3) &&
22110         "Expected CCIndex to be 2 and CmpIndex to be 3.");
22111  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
22112                   ANDS.getValue(1)};
22113  return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
22114}
22115
22116static
22117SDValue performCONDCombine(SDNode *N,
22118                           TargetLowering::DAGCombinerInfo &DCI,
22119                           SelectionDAG &DAG, unsigned CCIndex,
22120                           unsigned CmpIndex) {
22121  unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
22122  SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
22123  unsigned CondOpcode = SubsNode->getOpcode();
22124
22125  if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0) ||
22126      !SubsNode->hasOneUse())
22127    return SDValue();
22128
22129  // There is a SUBS feeding this condition. Is it fed by a mask we can
22130  // use?
22131
22132  SDNode *AndNode = SubsNode->getOperand(0).getNode();
22133  unsigned MaskBits = 0;
22134
22135  if (AndNode->getOpcode() != ISD::AND)
22136    return SDValue();
22137
22138  if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
22139                                             CmpIndex, CC))
22140    return Val;
22141
22142  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
22143    uint32_t CNV = CN->getZExtValue();
22144    if (CNV == 255)
22145      MaskBits = 8;
22146    else if (CNV == 65535)
22147      MaskBits = 16;
22148  }
22149
22150  if (!MaskBits)
22151    return SDValue();
22152
22153  SDValue AddValue = AndNode->getOperand(0);
22154
22155  if (AddValue.getOpcode() != ISD::ADD)
22156    return SDValue();
22157
22158  // The basic dag structure is correct, grab the inputs and validate them.
22159
22160  SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
22161  SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
22162  SDValue SubsInputValue = SubsNode->getOperand(1);
22163
  // The mask is present and the provenance of all the values is a smaller type;
  // let's see if the mask is superfluous.
22166
22167  if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
22168      !isa<ConstantSDNode>(SubsInputValue.getNode()))
22169    return SDValue();
22170
22171  ISD::LoadExtType ExtType;
22172
22173  if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
22174      !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
22175      !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
22176    return SDValue();
22177
22178  if(!isEquivalentMaskless(CC, MaskBits, ExtType,
22179                cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
22180                cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
22181    return SDValue();
22182
22183  // The AND is not necessary, remove it.
22184
22185  SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
22186                               SubsNode->getValueType(1));
22187  SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
22188
22189  SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
22190  DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
22191
22192  return SDValue(N, 0);
22193}
22194
22195// Optimize compare with zero and branch.
22196static SDValue performBRCONDCombine(SDNode *N,
22197                                    TargetLowering::DAGCombinerInfo &DCI,
22198                                    SelectionDAG &DAG) {
22199  MachineFunction &MF = DAG.getMachineFunction();
22200  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
22201  // will not be produced, as they are conditional branch instructions that do
22202  // not set flags.
22203  if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
22204    return SDValue();
22205
22206  if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
22207    N = NV.getNode();
22208  SDValue Chain = N->getOperand(0);
22209  SDValue Dest = N->getOperand(1);
22210  SDValue CCVal = N->getOperand(2);
22211  SDValue Cmp = N->getOperand(3);
22212
22213  assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
22214  unsigned CC = CCVal->getAsZExtVal();
22215  if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
22216    return SDValue();
22217
22218  unsigned CmpOpc = Cmp.getOpcode();
22219  if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
22220    return SDValue();
22221
22222  // Only attempt folding if there is only one use of the flag and no use of the
22223  // value.
22224  if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
22225    return SDValue();
22226
22227  SDValue LHS = Cmp.getOperand(0);
22228  SDValue RHS = Cmp.getOperand(1);
22229
22230  assert(LHS.getValueType() == RHS.getValueType() &&
22231         "Expected the value type to be the same for both operands!");
22232  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
22233    return SDValue();
22234
22235  if (isNullConstant(LHS))
22236    std::swap(LHS, RHS);
22237
22238  if (!isNullConstant(RHS))
22239    return SDValue();
22240
22241  if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
22242      LHS.getOpcode() == ISD::SRL)
22243    return SDValue();
22244
22245  // Fold the compare into the branch instruction.
22246  SDValue BR;
22247  if (CC == AArch64CC::EQ)
22248    BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
22249  else
22250    BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
22251
22252  // Do not add new nodes to DAG combiner worklist.
22253  DCI.CombineTo(N, BR, false);
22254
22255  return SDValue();
22256}
22257
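// Fold a CSEL that guards a CTTZ against a zero input into a simple mask:
//   (CSEL 0, (CTTZ X), EQ, (SUBS X, 0)) -> (AND (CTTZ X), BitWidth - 1)
// This is valid because CTTZ returns BitWidth when X is zero, and for the
// power-of-two bit widths handled here (32 and 64) BitWidth & (BitWidth - 1)
// is 0, while for non-zero X the AND leaves the CTTZ result unchanged.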
22258static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
22259  unsigned CC = N->getConstantOperandVal(2);
22260  SDValue SUBS = N->getOperand(3);
22261  SDValue Zero, CTTZ;
22262
22263  if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
22264    Zero = N->getOperand(0);
22265    CTTZ = N->getOperand(1);
22266  } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
22267    Zero = N->getOperand(1);
22268    CTTZ = N->getOperand(0);
22269  } else
22270    return SDValue();
22271
22272  if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
22273      (CTTZ.getOpcode() == ISD::TRUNCATE &&
22274       CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
22275    return SDValue();
22276
22277  assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
22278         "Illegal type in CTTZ folding");
22279
22280  if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
22281    return SDValue();
22282
22283  SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
22284                  ? CTTZ.getOperand(0).getOperand(0)
22285                  : CTTZ.getOperand(0);
22286
22287  if (X != SUBS.getOperand(0))
22288    return SDValue();
22289
22290  unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
22291                          ? CTTZ.getOperand(0).getValueSizeInBits()
22292                          : CTTZ.getValueSizeInBits();
22293  SDValue BitWidthMinusOne =
22294      DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
22295  return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
22296                     BitWidthMinusOne);
22297}
22298
22299// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
22300// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
22301// Where x and y are constants and x != y
22302
22303// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
22304// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
22305// Where x and y are constants and x != y
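//
// For example, with x = 1 and y = 0:
//   (CSEL l r EQ (CMP (CSEL 1 0 cc2 cond) 1)) => (CSEL l r cc2 cond)
// because the inner CSEL produces 1 exactly when cc2 holds for cond.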
22306static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
22307  SDValue L = Op->getOperand(0);
22308  SDValue R = Op->getOperand(1);
22309  AArch64CC::CondCode OpCC =
22310      static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
22311
22312  SDValue OpCmp = Op->getOperand(3);
22313  if (!isCMP(OpCmp))
22314    return SDValue();
22315
22316  SDValue CmpLHS = OpCmp.getOperand(0);
22317  SDValue CmpRHS = OpCmp.getOperand(1);
22318
22319  if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
22320    std::swap(CmpLHS, CmpRHS);
22321  else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
22322    return SDValue();
22323
22324  SDValue X = CmpLHS->getOperand(0);
22325  SDValue Y = CmpLHS->getOperand(1);
22326  if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
22327    return SDValue();
22328  }
22329
  // If one of the constants is an opaque constant, the X and Y nodes can still
  // be distinct even though the underlying values are equal. Compare the APInt
  // values here to make sure the fold is correct.
22333  ConstantSDNode *CX = cast<ConstantSDNode>(X);
22334  ConstantSDNode *CY = cast<ConstantSDNode>(Y);
22335  if (CX->getAPIntValue() == CY->getAPIntValue())
22336    return SDValue();
22337
22338  AArch64CC::CondCode CC =
22339      static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
22340  SDValue Cond = CmpLHS->getOperand(3);
22341
22342  if (CmpRHS == Y)
22343    CC = AArch64CC::getInvertedCondCode(CC);
22344  else if (CmpRHS != X)
22345    return SDValue();
22346
22347  if (OpCC == AArch64CC::NE)
22348    CC = AArch64CC::getInvertedCondCode(CC);
22349  else if (OpCC != AArch64CC::EQ)
22350    return SDValue();
22351
22352  SDLoc DL(Op);
22353  EVT VT = Op->getValueType(0);
22354
22355  SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
22356  return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
22357}
22358
22359// Optimize CSEL instructions
22360static SDValue performCSELCombine(SDNode *N,
22361                                  TargetLowering::DAGCombinerInfo &DCI,
22362                                  SelectionDAG &DAG) {
22363  // CSEL x, x, cc -> x
22364  if (N->getOperand(0) == N->getOperand(1))
22365    return N->getOperand(0);
22366
22367  if (SDValue R = foldCSELOfCSEL(N, DAG))
22368    return R;
22369
22370  // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
22371  // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
22372  if (SDValue Folded = foldCSELofCTTZ(N, DAG))
    return Folded;
22374
22375  return performCONDCombine(N, DCI, DAG, 2, 3);
22376}
22377
// Try to re-use an already extended operand of a vector SetCC feeding an
22379// extended select. Doing so avoids requiring another full extension of the
22380// SET_CC result when lowering the select.
22381static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
22382  EVT Op0MVT = Op->getOperand(0).getValueType();
22383  if (!Op0MVT.isVector() || Op->use_empty())
22384    return SDValue();
22385
22386  // Make sure that all uses of Op are VSELECTs with result matching types where
22387  // the result type has a larger element type than the SetCC operand.
22388  SDNode *FirstUse = *Op->use_begin();
22389  if (FirstUse->getOpcode() != ISD::VSELECT)
22390    return SDValue();
22391  EVT UseMVT = FirstUse->getValueType(0);
22392  if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
22393    return SDValue();
22394  if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
22395        return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
22396      }))
22397    return SDValue();
22398
22399  APInt V;
22400  if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
22401    return SDValue();
22402
22403  SDLoc DL(Op);
22404  SDValue Op0ExtV;
22405  SDValue Op1ExtV;
22406  ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
22407  // Check if the first operand of the SET_CC is already extended. If it is,
22408  // split the SET_CC and re-use the extended version of the operand.
22409  SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
22410                                        Op->getOperand(0));
22411  SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
22412                                        Op->getOperand(0));
22413  if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
22414    Op0ExtV = SDValue(Op0SExt, 0);
22415    Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
22416  } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
22417    Op0ExtV = SDValue(Op0ZExt, 0);
22418    Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
22419  } else
22420    return SDValue();
22421
22422  return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
22423                     Op0ExtV, Op1ExtV, Op->getOperand(2));
22424}
22425
22426static SDValue
22427performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
22428                               SelectionDAG &DAG) {
22429  SDValue Vec = N->getOperand(0);
22430  if (DCI.isBeforeLegalize() &&
22431      Vec.getValueType().getVectorElementType() == MVT::i1 &&
22432      Vec.getValueType().isFixedLengthVector() &&
22433      Vec.getValueType().isPow2VectorType()) {
22434    SDLoc DL(N);
22435    return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
22436                                  DAG);
22437  }
22438
22439  return SDValue();
22440}
22441
22442static SDValue performSETCCCombine(SDNode *N,
22443                                   TargetLowering::DAGCombinerInfo &DCI,
22444                                   SelectionDAG &DAG) {
22445  assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
22446  SDValue LHS = N->getOperand(0);
22447  SDValue RHS = N->getOperand(1);
22448  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
22449  SDLoc DL(N);
22450  EVT VT = N->getValueType(0);
22451
22452  if (SDValue V = tryToWidenSetCCOperands(N, DAG))
22453    return V;
22454
22455  // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
22456  if (Cond == ISD::SETNE && isOneConstant(RHS) &&
22457      LHS->getOpcode() == AArch64ISD::CSEL &&
22458      isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
22459      LHS->hasOneUse()) {
22460    // Invert CSEL's condition.
22461    auto OldCond =
22462        static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
22463    auto NewCond = getInvertedCondCode(OldCond);
22464
22465    // csel 0, 1, !cond, X
22466    SDValue CSEL =
22467        DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
22468                    LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
22469                    LHS.getOperand(3));
22470    return DAG.getZExtOrTrunc(CSEL, DL, VT);
22471  }
22472
22473  // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
22474  if (Cond == ISD::SETNE && isNullConstant(RHS) &&
22475      LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
22476      LHS->getConstantOperandVal(1) < VT.getScalarSizeInBits() &&
22477      LHS->hasOneUse()) {
22478    EVT TstVT = LHS->getValueType(0);
22479    if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
      // This pattern will be optimized better by emitComparison.
22481      uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
22482      SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
22483                                DAG.getConstant(TstImm, DL, TstVT));
22484      return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
22485    }
22486  }
22487
22488  // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
22489  //   ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
22490  // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
22491  //   ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
22492  if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
22493      (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
22494      (isNullConstant(RHS) || isAllOnesConstant(RHS)) &&
22495      LHS->getOpcode() == ISD::BITCAST) {
22496    EVT ToVT = LHS->getValueType(0);
22497    EVT FromVT = LHS->getOperand(0).getValueType();
22498    if (FromVT.isFixedLengthVector() &&
22499        FromVT.getVectorElementType() == MVT::i1) {
22500      bool IsNull = isNullConstant(RHS);
22501      LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
22502                        DL, MVT::i1, LHS->getOperand(0));
22503      LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
22504                        LHS);
22505      return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
22506    }
22507  }
22508
22509  // Try to perform the memcmp when the result is tested for [in]equality with 0
22510  if (SDValue V = performOrXorChainCombine(N, DAG))
22511    return V;
22512
22513  return SDValue();
22514}
22515
22516// Replace a flag-setting operator (eg ANDS) with the generic version
22517// (eg AND) if the flag is unused.
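//
// For example, if only value #0 of (ANDS x, y) is used, the node is rewritten
// as (AND x, y) paired with a zero placeholder for the unused flag result.
// Conversely, if an identical (AND x, y) already exists in the DAG, its uses
// are redirected to value #0 of the flag-setting node so the logical operation
// is not computed twice.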
22518static SDValue performFlagSettingCombine(SDNode *N,
22519                                         TargetLowering::DAGCombinerInfo &DCI,
22520                                         unsigned GenericOpcode) {
22521  SDLoc DL(N);
22522  SDValue LHS = N->getOperand(0);
22523  SDValue RHS = N->getOperand(1);
22524  EVT VT = N->getValueType(0);
22525
22526  // If the flag result isn't used, convert back to a generic opcode.
22527  if (!N->hasAnyUseOfValue(1)) {
22528    SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
22529    return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
22530                                  DL);
22531  }
22532
22533  // Combine identical generic nodes into this node, re-using the result.
22534  if (SDNode *Generic = DCI.DAG.getNodeIfExists(
22535          GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
22536    DCI.CombineTo(Generic, SDValue(N, 0));
22537
22538  return SDValue();
22539}
22540
22541static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
22542  // setcc_merge_zero pred
22543  //   (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
22544  //   => extract_subvector (inner setcc_merge_zero)
22545  SDValue Pred = N->getOperand(0);
22546  SDValue LHS = N->getOperand(1);
22547  SDValue RHS = N->getOperand(2);
22548  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
22549
22550  if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
22551      LHS->getOpcode() != ISD::SIGN_EXTEND)
22552    return SDValue();
22553
22554  SDValue Extract = LHS->getOperand(0);
22555  if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
22556      Extract->getValueType(0) != N->getValueType(0) ||
22557      Extract->getConstantOperandVal(1) != 0)
22558    return SDValue();
22559
22560  SDValue InnerSetCC = Extract->getOperand(0);
22561  if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
22562    return SDValue();
22563
22564  // By this point we've effectively got
22565  // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
22566  // lanes are already zero then the trunc(sext()) sequence is redundant and we
22567  // can operate on A directly.
22568  SDValue InnerPred = InnerSetCC.getOperand(0);
22569  if (Pred.getOpcode() == AArch64ISD::PTRUE &&
22570      InnerPred.getOpcode() == AArch64ISD::PTRUE &&
22571      Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
22572      Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
22573      Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
22574    return Extract;
22575
22576  return SDValue();
22577}
22578
22579static SDValue
22580performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
22581  assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
22582         "Unexpected opcode!");
22583
22584  SelectionDAG &DAG = DCI.DAG;
22585  SDValue Pred = N->getOperand(0);
22586  SDValue LHS = N->getOperand(1);
22587  SDValue RHS = N->getOperand(2);
22588  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
22589
22590  if (SDValue V = performSetCCPunpkCombine(N, DAG))
22591    return V;
22592
22593  if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
22594      LHS->getOpcode() == ISD::SIGN_EXTEND &&
22595      LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
22596    //    setcc_merge_zero(
22597    //       pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
22598    // => setcc_merge_zero(pred, ...)
22599    if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
22600        LHS->getOperand(0)->getOperand(0) == Pred)
22601      return LHS->getOperand(0);
22602
22603    //    setcc_merge_zero(
22604    //        all_active, extend(nxvNi1 ...), != splat(0))
22605    // -> nxvNi1 ...
22606    if (isAllActivePredicate(DAG, Pred))
22607      return LHS->getOperand(0);
22608
22609    //    setcc_merge_zero(
22610    //        pred, extend(nxvNi1 ...), != splat(0))
22611    // -> nxvNi1 and(pred, ...)
22612    if (DCI.isAfterLegalizeDAG())
22613      // Do this after legalization to allow more folds on setcc_merge_zero
22614      // to be recognized.
22615      return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
22616                         LHS->getOperand(0), Pred);
22617  }
22618
22619  return SDValue();
22620}
22621
22622// Optimize some simple tbz/tbnz cases.  Returns the new operand and bit to test
22623// as well as whether the test should be inverted.  This code is required to
22624// catch these cases (as opposed to standard dag combines) because
22625// AArch64ISD::TBZ is matched during legalization.
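// For example, chaining the AND and SRL cases below gives
//   (tbz (and (srl x, 2), 1), 0) -> (tbz x, 2)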
22626static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
22627                                 SelectionDAG &DAG) {
22628
22629  if (!Op->hasOneUse())
22630    return Op;
22631
22632  // We don't handle undef/constant-fold cases below, as they should have
22633  // already been taken care of (e.g. and of 0, test of undefined shifted bits,
22634  // etc.)
22635
22636  // (tbz (trunc x), b) -> (tbz x, b)
22637  // This case is just here to enable more of the below cases to be caught.
22638  if (Op->getOpcode() == ISD::TRUNCATE &&
22639      Bit < Op->getValueType(0).getSizeInBits()) {
22640    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
22641  }
22642
22643  // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
22644  if (Op->getOpcode() == ISD::ANY_EXTEND &&
22645      Bit < Op->getOperand(0).getValueSizeInBits()) {
22646    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
22647  }
22648
22649  if (Op->getNumOperands() != 2)
22650    return Op;
22651
22652  auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
22653  if (!C)
22654    return Op;
22655
22656  switch (Op->getOpcode()) {
22657  default:
22658    return Op;
22659
22660  // (tbz (and x, m), b) -> (tbz x, b)
22661  case ISD::AND:
22662    if ((C->getZExtValue() >> Bit) & 1)
22663      return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
22664    return Op;
22665
22666  // (tbz (shl x, c), b) -> (tbz x, b-c)
22667  case ISD::SHL:
22668    if (C->getZExtValue() <= Bit &&
22669        (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
22670      Bit = Bit - C->getZExtValue();
22671      return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
22672    }
22673    return Op;
22674
22675  // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
22676  case ISD::SRA:
22677    Bit = Bit + C->getZExtValue();
22678    if (Bit >= Op->getValueType(0).getSizeInBits())
22679      Bit = Op->getValueType(0).getSizeInBits() - 1;
22680    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
22681
22682  // (tbz (srl x, c), b) -> (tbz x, b+c)
22683  case ISD::SRL:
22684    if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
22685      Bit = Bit + C->getZExtValue();
22686      return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
22687    }
22688    return Op;
22689
22690  // (tbz (xor x, -1), b) -> (tbnz x, b)
22691  case ISD::XOR:
22692    if ((C->getZExtValue() >> Bit) & 1)
22693      Invert = !Invert;
22694    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
22695  }
22696}
22697
22698// Optimize test single bit zero/non-zero and branch.
22699static SDValue performTBZCombine(SDNode *N,
22700                                 TargetLowering::DAGCombinerInfo &DCI,
22701                                 SelectionDAG &DAG) {
22702  unsigned Bit = N->getConstantOperandVal(2);
22703  bool Invert = false;
22704  SDValue TestSrc = N->getOperand(1);
22705  SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
22706
22707  if (TestSrc == NewTestSrc)
22708    return SDValue();
22709
22710  unsigned NewOpc = N->getOpcode();
22711  if (Invert) {
22712    if (NewOpc == AArch64ISD::TBZ)
22713      NewOpc = AArch64ISD::TBNZ;
22714    else {
22715      assert(NewOpc == AArch64ISD::TBNZ);
22716      NewOpc = AArch64ISD::TBZ;
22717    }
22718  }
22719
22720  SDLoc DL(N);
22721  return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
22722                     DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
22723}
22724
22725// Swap vselect operands where it may allow a predicated operation to achieve
22726// the `sel`.
22727//
22728//     (vselect (setcc ( condcode) (_) (_)) (a)          (op (a) (b)))
22729//  => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
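//
// For example, for floating-point operands:
//     (vselect (setcc oge P Q) A (fadd A B))
//  => (vselect (setcc ult P Q) (fadd A B) A)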
22730static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
22731  auto SelectA = N->getOperand(1);
22732  auto SelectB = N->getOperand(2);
22733  auto NTy = N->getValueType(0);
22734
22735  if (!NTy.isScalableVector())
22736    return SDValue();
22737  SDValue SetCC = N->getOperand(0);
22738  if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
22739    return SDValue();
22740
22741  switch (SelectB.getOpcode()) {
22742  default:
22743    return SDValue();
22744  case ISD::FMUL:
22745  case ISD::FSUB:
22746  case ISD::FADD:
22747    break;
22748  }
22749  if (SelectA != SelectB.getOperand(0))
22750    return SDValue();
22751
22752  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
22753  ISD::CondCode InverseCC =
22754      ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
22755  auto InverseSetCC =
22756      DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
22757                   SetCC.getOperand(1), InverseCC);
22758
22759  return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
22760                     {InverseSetCC, SelectB, SelectA});
22761}
22762
22763// vselect (v1i1 setcc) ->
22764//     vselect (v1iXX setcc)  (XX is the size of the compared operand type)
22765// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
22766// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
22767// such VSELECT.
22768static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
22769  if (auto SwapResult = trySwapVSelectOperands(N, DAG))
22770    return SwapResult;
22771
22772  SDValue N0 = N->getOperand(0);
22773  EVT CCVT = N0.getValueType();
22774
22775  if (isAllActivePredicate(DAG, N0))
22776    return N->getOperand(1);
22777
22778  if (isAllInactivePredicate(N0))
22779    return N->getOperand(2);
22780
22781  // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
22782  // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
22783  // supported types.
22784  SDValue SetCC = N->getOperand(0);
22785  if (SetCC.getOpcode() == ISD::SETCC &&
22786      SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
22787    SDValue CmpLHS = SetCC.getOperand(0);
22788    EVT VT = CmpLHS.getValueType();
22789    SDNode *CmpRHS = SetCC.getOperand(1).getNode();
22790    SDNode *SplatLHS = N->getOperand(1).getNode();
22791    SDNode *SplatRHS = N->getOperand(2).getNode();
22792    APInt SplatLHSVal;
22793    if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
22794        VT.isSimple() &&
22795        is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
22796                               MVT::v2i32, MVT::v4i32, MVT::v2i64}),
22797                     VT.getSimpleVT().SimpleTy) &&
22798        ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
22799        SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
22800        ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
22801      unsigned NumElts = VT.getVectorNumElements();
22802      SmallVector<SDValue, 8> Ops(
22803          NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
22804                                   VT.getScalarType()));
22805      SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
22806
22807      auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
22808      auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
22809      return Or;
22810    }
22811  }
22812
22813  EVT CmpVT = N0.getOperand(0).getValueType();
22814  if (N0.getOpcode() != ISD::SETCC ||
22815      CCVT.getVectorElementCount() != ElementCount::getFixed(1) ||
22816      CCVT.getVectorElementType() != MVT::i1 ||
22817      CmpVT.getVectorElementType().isFloatingPoint())
22818    return SDValue();
22819
22820  EVT ResVT = N->getValueType(0);
22821  // Only combine when the result type is of the same size as the compared
22822  // operands.
22823  if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
22824    return SDValue();
22825
22826  SDValue IfTrue = N->getOperand(1);
22827  SDValue IfFalse = N->getOperand(2);
22828  SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
22829                       N0.getOperand(0), N0.getOperand(1),
22830                       cast<CondCodeSDNode>(N0.getOperand(2))->get());
22831  return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
22832                     IfTrue, IfFalse);
22833}
22834
22835/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
22836/// the compare-mask instructions rather than going via NZCV, even if LHS and
22837/// RHS are really scalar. This replaces any scalar setcc in the above pattern
22838/// with a vector one followed by a DUP shuffle on the result.
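///
/// For example, "(select (setcc olt, f64 a, f64 b), v2f64 x, v2f64 y)" is
/// rewritten so a and b are placed in lane 0 of v2f64 vectors, a v2i64 vector
/// setcc is performed, lane 0 of its result is duplicated across the vector,
/// and that mask drives a VSELECT between x and y.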
22839static SDValue performSelectCombine(SDNode *N,
22840                                    TargetLowering::DAGCombinerInfo &DCI) {
22841  SelectionDAG &DAG = DCI.DAG;
22842  SDValue N0 = N->getOperand(0);
22843  EVT ResVT = N->getValueType(0);
22844
22845  if (N0.getOpcode() != ISD::SETCC)
22846    return SDValue();
22847
22848  if (ResVT.isScalableVT())
22849    return SDValue();
22850
22851  // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
22852  // scalar SetCCResultType. We also don't expect vectors, because we assume
22853  // that selects fed by vector SETCCs are canonicalized to VSELECT.
22854  assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
22855         "Scalar-SETCC feeding SELECT has unexpected result type!");
22856
  // If NumMaskElts == 0, the comparison is larger than the select result. The
  // largest real NEON comparison is 64 bits per lane, which means the result is
  // at most 32 bits and an illegal vector. Just bail out for now.
22860  EVT SrcVT = N0.getOperand(0).getValueType();
22861
22862  // Don't try to do this optimization when the setcc itself has i1 operands.
22863  // There are no legal vectors of i1, so this would be pointless. v1f16 is
  // ruled out to prevent the creation of setccs that need to be scalarized.
22865  if (SrcVT == MVT::i1 ||
22866      (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
22867    return SDValue();
22868
22869  int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
22870  if (!ResVT.isVector() || NumMaskElts == 0)
22871    return SDValue();
22872
22873  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
22874  EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
22875
22876  // Also bail out if the vector CCVT isn't the same size as ResVT.
22877  // This can happen if the SETCC operand size doesn't divide the ResVT size
22878  // (e.g., f64 vs v3f32).
22879  if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
22880    return SDValue();
22881
22882  // Make sure we didn't create illegal types, if we're not supposed to.
22883  assert(DCI.isBeforeLegalize() ||
22884         DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
22885
22886  // First perform a vector comparison, where lane 0 is the one we're interested
22887  // in.
22888  SDLoc DL(N0);
22889  SDValue LHS =
22890      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
22891  SDValue RHS =
22892      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
22893  SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
22894
22895  // Now duplicate the comparison mask we want across all other lanes.
22896  SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
22897  SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
22898  Mask = DAG.getNode(ISD::BITCAST, DL,
22899                     ResVT.changeVectorElementTypeToInteger(), Mask);
22900
22901  return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
22902}
22903
22904static SDValue performDUPCombine(SDNode *N,
22905                                 TargetLowering::DAGCombinerInfo &DCI) {
22906  EVT VT = N->getValueType(0);
22907  SDLoc DL(N);
22908  // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
  // 128-bit vector version.
22910  if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
22911    EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
22912    SmallVector<SDValue> Ops(N->ops());
22913    if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
22914                                             DCI.DAG.getVTList(LVT), Ops)) {
22915      return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
22916                             DCI.DAG.getConstant(0, DL, MVT::i64));
22917    }
22918  }
22919
22920  if (N->getOpcode() == AArch64ISD::DUP) {
22921    if (DCI.isAfterLegalizeDAG()) {
      // If the scalar DUP's operand is an extract_vector_elt, try to combine
      // them into a DUPLANE. For example,
22924      //
22925      //    t21: i32 = extract_vector_elt t19, Constant:i64<0>
22926      //  t18: v4i32 = AArch64ISD::DUP t21
22927      //  ==>
22928      //  t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
22929      SDValue EXTRACT_VEC_ELT = N->getOperand(0);
22930      if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
22931        if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
22932          unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
22933          return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
22934                                 EXTRACT_VEC_ELT.getOperand(1));
22935        }
22936      }
22937    }
22938
22939    return performPostLD1Combine(N, DCI, false);
22940  }
22941
22942  return SDValue();
22943}
22944
22945/// Get rid of unnecessary NVCASTs (that don't change the type).
22946static SDValue performNVCASTCombine(SDNode *N) {
22947  if (N->getValueType(0) == N->getOperand(0).getValueType())
22948    return N->getOperand(0);
22949
22950  return SDValue();
22951}
22952
22953// If all users of the globaladdr are of the form (globaladdr + constant), find
22954// the smallest constant, fold it into the globaladdr's offset and rewrite the
22955// globaladdr as (globaladdr + constant) - constant.
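//
// For example, if the only uses are (add (globaladdr g), 8) and
// (add (globaladdr g), 12), the global address is rewritten as
// (sub (globaladdr g + 8), 8); the +8 can then be folded into the address
// materialization and the remaining adds simplify to offsets of 0 and 4.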
22956static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
22957                                           const AArch64Subtarget *Subtarget,
22958                                           const TargetMachine &TM) {
22959  auto *GN = cast<GlobalAddressSDNode>(N);
22960  if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
22961      AArch64II::MO_NO_FLAG)
22962    return SDValue();
22963
22964  uint64_t MinOffset = -1ull;
22965  for (SDNode *N : GN->uses()) {
22966    if (N->getOpcode() != ISD::ADD)
22967      return SDValue();
22968    auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
22969    if (!C)
22970      C = dyn_cast<ConstantSDNode>(N->getOperand(1));
22971    if (!C)
22972      return SDValue();
22973    MinOffset = std::min(MinOffset, C->getZExtValue());
22974  }
22975  uint64_t Offset = MinOffset + GN->getOffset();
22976
22977  // Require that the new offset is larger than the existing one. Otherwise, we
22978  // can end up oscillating between two possible DAGs, for example,
22979  // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
22980  if (Offset <= uint64_t(GN->getOffset()))
22981    return SDValue();
22982
22983  // Check whether folding this offset is legal. It must not go out of bounds of
22984  // the referenced object to avoid violating the code model, and must be
22985  // smaller than 2^20 because this is the largest offset expressible in all
22986  // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
22987  // stores an immediate signed 21 bit offset.)
22988  //
22989  // This check also prevents us from folding negative offsets, which will end
22990  // up being treated in the same way as large positive ones. They could also
22991  // cause code model violations, and aren't really common enough to matter.
22992  if (Offset >= (1 << 20))
22993    return SDValue();
22994
22995  const GlobalValue *GV = GN->getGlobal();
22996  Type *T = GV->getValueType();
22997  if (!T->isSized() ||
22998      Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
22999    return SDValue();
23000
23001  SDLoc DL(GN);
23002  SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
23003  return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
23004                     DAG.getConstant(MinOffset, DL, MVT::i64));
23005}
23006
23007static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
23008                                  const AArch64Subtarget *Subtarget) {
23009  SDValue BR = N->getOperand(0);
23010  if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
23011      !BR.getValueType().isScalarInteger())
23012    return SDValue();
23013
23014  SDLoc DL(N);
23015  return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
23016}
23017
// Turns the vector of indices into a vector of byte offsets by scaling Offset
// by (BitWidth / 8).
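// For example, with 32-bit elements (BitWidth == 32) each index i becomes the
// byte offset i << 2.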
23020static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
23021                                          SDLoc DL, unsigned BitWidth) {
23022  assert(Offset.getValueType().isScalableVector() &&
23023         "This method is only for scalable vectors of offsets");
23024
23025  SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
23026  SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
23027
23028  return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
23029}
23030
23031/// Check if the value of \p OffsetInBytes can be used as an immediate for
23032/// the gather load/prefetch and scatter store instructions with vector base and
23033/// immediate offset addressing mode:
23034///
23035///      [<Zn>.[S|D]{, #<imm>}]
23036///
23037/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
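///
/// For example, for 32-bit elements (sizeof(<T>) == 4) the valid immediates
/// are 0, 4, 8, ..., 124.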
23038inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
23039                                                  unsigned ScalarSizeInBytes) {
23040  // The immediate is not a multiple of the scalar size.
23041  if (OffsetInBytes % ScalarSizeInBytes)
23042    return false;
23043
23044  // The immediate is out of range.
23045  if (OffsetInBytes / ScalarSizeInBytes > 31)
23046    return false;
23047
23048  return true;
23049}
23050
23051/// Check if the value of \p Offset represents a valid immediate for the SVE
/// gather load/prefetch and scatter store instructions with vector base and
23053/// immediate offset addressing mode:
23054///
23055///      [<Zn>.[S|D]{, #<imm>}]
23056///
23057/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
23058static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
23059                                           unsigned ScalarSizeInBytes) {
23060  ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
23061  return OffsetConst && isValidImmForSVEVecImmAddrMode(
23062                            OffsetConst->getZExtValue(), ScalarSizeInBytes);
23063}
23064
23065static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
23066                                          unsigned Opcode,
23067                                          bool OnlyPackedOffsets = true) {
23068  const SDValue Src = N->getOperand(2);
23069  const EVT SrcVT = Src->getValueType(0);
23070  assert(SrcVT.isScalableVector() &&
23071         "Scatter stores are only possible for SVE vectors");
23072
23073  SDLoc DL(N);
23074  MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
23075
23076  // Make sure that source data will fit into an SVE register
23077  if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
23078    return SDValue();
23079
23080  // For FPs, ACLE only supports _packed_ single and double precision types.
23081  // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
23082  if (SrcElVT.isFloatingPoint())
23083    if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
23084        ((Opcode != AArch64ISD::SST1Q_PRED &&
23085          Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
23086         ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
23087      return SDValue();
23088
23089  // Depending on the addressing mode, this is either a pointer or a vector of
23090  // pointers (that fits into one register)
23091  SDValue Base = N->getOperand(4);
23092  // Depending on the addressing mode, this is either a single offset or a
23093  // vector of offsets  (that fits into one register)
23094  SDValue Offset = N->getOperand(5);
23095
23096  // For "scalar + vector of indices", just scale the indices. This only
23097  // applies to non-temporal scatters because there's no instruction that takes
  // indices.
23099  if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
23100    Offset =
23101        getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
23102    Opcode = AArch64ISD::SSTNT1_PRED;
23103  } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
23104    Offset =
23105        getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
23106    Opcode = AArch64ISD::SST1Q_PRED;
23107  }
23108
  // In the case of non-temporal scatter stores there's only one SVE instruction
  // per data-size: "vector + scalar", i.e.
  //    * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
23112  // Since we do have intrinsics that allow the arguments to be in a different
23113  // order, we may need to swap them to match the spec.
23114  if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
23115      Offset.getValueType().isVector())
23116    std::swap(Base, Offset);
23117
23118  // SST1_IMM requires that the offset is an immediate that is:
23119  //    * a multiple of #SizeInBytes,
23120  //    * in the range [0, 31 x #SizeInBytes],
23121  // where #SizeInBytes is the size in bytes of the stored items. For
23122  // immediates outside that range and non-immediate scalar offsets use SST1 or
23123  // SST1_UXTW instead.
23124  if (Opcode == AArch64ISD::SST1_IMM_PRED) {
23125    if (!isValidImmForSVEVecImmAddrMode(Offset,
23126                                        SrcVT.getScalarSizeInBits() / 8)) {
23127      if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
23128        Opcode = AArch64ISD::SST1_UXTW_PRED;
23129      else
23130        Opcode = AArch64ISD::SST1_PRED;
23131
23132      std::swap(Base, Offset);
23133    }
23134  }
23135
23136  auto &TLI = DAG.getTargetLoweringInfo();
23137  if (!TLI.isTypeLegal(Base.getValueType()))
23138    return SDValue();
23139
  // Some scatter store variants allow unpacked offsets, but only as nxv2i32
  // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
  // nxv2i64. Legalize accordingly.
23143  if (!OnlyPackedOffsets &&
23144      Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
23145    Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
23146
23147  if (!TLI.isTypeLegal(Offset.getValueType()))
23148    return SDValue();
23149
23150  // Source value type that is representable in hardware
23151  EVT HwSrcVt = getSVEContainerType(SrcVT);
23152
23153  // Keep the original type of the input data to store - this is needed to be
23154  // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
23155  // FP values we want the integer equivalent, so just use HwSrcVt.
23156  SDValue InputVT = DAG.getValueType(SrcVT);
23157  if (SrcVT.isFloatingPoint())
23158    InputVT = DAG.getValueType(HwSrcVt);
23159
23160  SDVTList VTs = DAG.getVTList(MVT::Other);
23161  SDValue SrcNew;
23162
23163  if (Src.getValueType().isFloatingPoint())
23164    SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
23165  else
23166    SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
23167
23168  SDValue Ops[] = {N->getOperand(0), // Chain
23169                   SrcNew,
23170                   N->getOperand(3), // Pg
23171                   Base,
23172                   Offset,
23173                   InputVT};
23174
23175  return DAG.getNode(Opcode, DL, VTs, Ops);
23176}
23177
23178static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
23179                                        unsigned Opcode,
23180                                        bool OnlyPackedOffsets = true) {
23181  const EVT RetVT = N->getValueType(0);
23182  assert(RetVT.isScalableVector() &&
23183         "Gather loads are only possible for SVE vectors");
23184
23185  SDLoc DL(N);
23186
23187  // Make sure that the loaded data will fit into an SVE register
23188  if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
23189    return SDValue();
23190
23191  // Depending on the addressing mode, this is either a pointer or a vector of
23192  // pointers (that fits into one register)
23193  SDValue Base = N->getOperand(3);
23194  // Depending on the addressing mode, this is either a single offset or a
23195  // vector of offsets  (that fits into one register)
23196  SDValue Offset = N->getOperand(4);
23197
23198  // For "scalar + vector of indices", scale the indices to obtain unscaled
23199  // offsets. This applies to non-temporal and quadword gathers, which do not
23200  // have an addressing mode with scaled offset.
23201  if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
23202    Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
23203                                        RetVT.getScalarSizeInBits());
23204    Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
23205  } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
23206    Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
23207                                        RetVT.getScalarSizeInBits());
23208    Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
23209  }
23210
23211  // In the case of non-temporal gather loads and quadword gather loads there's
  // only one addressing mode: "vector + scalar", e.g.
23213  //   ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
23214  // Since we do have intrinsics that allow the arguments to be in a different
23215  // order, we may need to swap them to match the spec.
23216  if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
23217       Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
23218      Offset.getValueType().isVector())
23219    std::swap(Base, Offset);
23220
23221  // GLD{FF}1_IMM requires that the offset is an immediate that is:
23222  //    * a multiple of #SizeInBytes,
23223  //    * in the range [0, 31 x #SizeInBytes],
23224  // where #SizeInBytes is the size in bytes of the loaded items. For
23225  // immediates outside that range and non-immediate scalar offsets use
23226  // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
23227  if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
23228      Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
23229    if (!isValidImmForSVEVecImmAddrMode(Offset,
23230                                        RetVT.getScalarSizeInBits() / 8)) {
23231      if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
23232        Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
23233                     ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
23234                     : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
23235      else
23236        Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
23237                     ? AArch64ISD::GLD1_MERGE_ZERO
23238                     : AArch64ISD::GLDFF1_MERGE_ZERO;
23239
23240      std::swap(Base, Offset);
23241    }
23242  }
23243
23244  auto &TLI = DAG.getTargetLoweringInfo();
23245  if (!TLI.isTypeLegal(Base.getValueType()))
23246    return SDValue();
23247
  // Some gather load variants allow unpacked offsets, but only as nxv2i32
  // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
  // nxv2i64. Legalize accordingly.
23251  if (!OnlyPackedOffsets &&
23252      Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
23253    Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
23254
23255  // Return value type that is representable in hardware
23256  EVT HwRetVt = getSVEContainerType(RetVT);
23257
23258  // Keep the original output value type around - this is needed to be able to
23259  // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
  // values we want the integer equivalent, so just use HwRetVt.
23261  SDValue OutVT = DAG.getValueType(RetVT);
23262  if (RetVT.isFloatingPoint())
23263    OutVT = DAG.getValueType(HwRetVt);
23264
23265  SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
23266  SDValue Ops[] = {N->getOperand(0), // Chain
23267                   N->getOperand(2), // Pg
23268                   Base, Offset, OutVT};
23269
23270  SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
23271  SDValue LoadChain = SDValue(Load.getNode(), 1);
23272
23273  if (RetVT.isInteger() && (RetVT != HwRetVt))
23274    Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
23275
23276  // If the original return value was FP, bitcast accordingly. Doing it here
23277  // means that we can avoid adding TableGen patterns for FPs.
23278  if (RetVT.isFloatingPoint())
23279    Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
23280
23281  return DAG.getMergeValues({Load, LoadChain}, DL);
23282}
23283
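// Combine a sign_extend_inreg whose source is an SVE unsigned unpack into a
// signed unpack, or fold it into the corresponding sign-extending SVE load
// node (e.g. GLD1 -> GLD1S) when it matches the load's memory type.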
23284static SDValue
23285performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
23286                              SelectionDAG &DAG) {
23287  SDLoc DL(N);
23288  SDValue Src = N->getOperand(0);
23289  unsigned Opc = Src->getOpcode();
23290
23291  // Sign extend of an unsigned unpack -> signed unpack
23292  if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
23293
23294    unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
23295                                               : AArch64ISD::SUNPKLO;
23296
23297    // Push the sign extend to the operand of the unpack
23298    // This is necessary where, for example, the operand of the unpack
23299    // is another unpack:
23300    // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
23301    // ->
23302    // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8))
23303    // ->
23304    // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
23305    SDValue ExtOp = Src->getOperand(0);
23306    auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
23307    EVT EltTy = VT.getVectorElementType();
23308    (void)EltTy;
23309
23310    assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
23311           "Sign extending from an invalid type");
23312
23313    EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
23314
23315    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
23316                              ExtOp, DAG.getValueType(ExtVT));
23317
23318    return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
23319  }
23320
23321  if (DCI.isBeforeLegalizeOps())
23322    return SDValue();
23323
23324  if (!EnableCombineMGatherIntrinsics)
23325    return SDValue();
23326
23327  // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
23328  // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
23329  unsigned NewOpc;
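  // Operand index of the memory VT: contiguous loads (LD1/LDNF1/LDFF1) carry
  // it at operand 3, gather loads at operand 4.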
23330  unsigned MemVTOpNum = 4;
23331  switch (Opc) {
23332  case AArch64ISD::LD1_MERGE_ZERO:
23333    NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
23334    MemVTOpNum = 3;
23335    break;
23336  case AArch64ISD::LDNF1_MERGE_ZERO:
23337    NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
23338    MemVTOpNum = 3;
23339    break;
23340  case AArch64ISD::LDFF1_MERGE_ZERO:
23341    NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
23342    MemVTOpNum = 3;
23343    break;
23344  case AArch64ISD::GLD1_MERGE_ZERO:
23345    NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
23346    break;
23347  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
23348    NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23349    break;
23350  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
23351    NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
23352    break;
23353  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
23354    NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
23355    break;
23356  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
23357    NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
23358    break;
23359  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
23360    NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
23361    break;
23362  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
23363    NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
23364    break;
23365  case AArch64ISD::GLDFF1_MERGE_ZERO:
23366    NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
23367    break;
23368  case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
23369    NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
23370    break;
23371  case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
23372    NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
23373    break;
23374  case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
23375    NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
23376    break;
23377  case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
23378    NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
23379    break;
23380  case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
23381    NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
23382    break;
23383  case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
23384    NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
23385    break;
23386  case AArch64ISD::GLDNT1_MERGE_ZERO:
23387    NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
23388    break;
23389  default:
23390    return SDValue();
23391  }
23392
23393  EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
23394  EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
23395
23396  if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
23397    return SDValue();
23398
23399  EVT DstVT = N->getValueType(0);
23400  SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
23401
23402  SmallVector<SDValue, 5> Ops;
23403  for (unsigned I = 0; I < Src->getNumOperands(); ++I)
23404    Ops.push_back(Src->getOperand(I));
23405
23406  SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
23407  DCI.CombineTo(N, ExtLoad);
23408  DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
23409
23410  // Return N so it doesn't get rechecked
23411  return SDValue(N, 0);
23412}
23413
23414/// Legalize the gather prefetch (scalar + vector addressing mode) when the
23415/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
23416/// != nxv2i32) do not need legalization.
23417static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
23418  const unsigned OffsetPos = 4;
23419  SDValue Offset = N->getOperand(OffsetPos);
23420
23421  // Not an unpacked vector, bail out.
23422  if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
23423    return SDValue();
23424
23425  // Extend the unpacked offset vector to 64-bit lanes.
23426  SDLoc DL(N);
23427  Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
23428  SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
23429  // Replace the offset operand with the 64-bit one.
23430  Ops[OffsetPos] = Offset;
23431
23432  return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
23433}
23434
23435/// Combines a node carrying the intrinsic
23436/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
23437/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
23438/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
23439/// SVE gather prefetch instruction with vector plus immediate addressing mode.
23440static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
23441                                               unsigned ScalarSizeInBytes) {
23442  const unsigned ImmPos = 4, OffsetPos = 3;
23443  // No need to combine the node if the immediate is valid...
23444  if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
23445    return SDValue();
23446
23447  // ...otherwise swap the offset base with the offset...
23448  SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
23449  std::swap(Ops[ImmPos], Ops[OffsetPos]);
23450  // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
23451  // `aarch64_sve_prfb_gather_uxtw_index`.
23452  SDLoc DL(N);
23453  Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
23454                           MVT::i64);
23455
23456  return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
23457}
23458
23459// Return true if the vector operation can guarantee that only the first lane of
23460// its result contains data, with all bits in the other lanes set to zero.
23461static bool isLanes1toNKnownZero(SDValue Op) {
23462  switch (Op.getOpcode()) {
23463  default:
23464    return false;
23465  case AArch64ISD::ANDV_PRED:
23466  case AArch64ISD::EORV_PRED:
23467  case AArch64ISD::FADDA_PRED:
23468  case AArch64ISD::FADDV_PRED:
23469  case AArch64ISD::FMAXNMV_PRED:
23470  case AArch64ISD::FMAXV_PRED:
23471  case AArch64ISD::FMINNMV_PRED:
23472  case AArch64ISD::FMINV_PRED:
23473  case AArch64ISD::ORV_PRED:
23474  case AArch64ISD::SADDV_PRED:
23475  case AArch64ISD::SMAXV_PRED:
23476  case AArch64ISD::SMINV_PRED:
23477  case AArch64ISD::UADDV_PRED:
23478  case AArch64ISD::UMAXV_PRED:
23479  case AArch64ISD::UMINV_PRED:
23480    return true;
23481  }
23482}
23483
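// Fold insert_vector_elt(zeroinitializer, extract_vector_elt(X, 0), 0) -> X
// when lanes 1-N of X are already known to be zero, making the explicit
// zeroing of the upper lanes redundant.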
23484static SDValue removeRedundantInsertVectorElt(SDNode *N) {
23485  assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
23486  SDValue InsertVec = N->getOperand(0);
23487  SDValue InsertElt = N->getOperand(1);
23488  SDValue InsertIdx = N->getOperand(2);
23489
23490  // We only care about inserts into the first element...
23491  if (!isNullConstant(InsertIdx))
23492    return SDValue();
23493  // ...of a zero'd vector...
23494  if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode()))
23495    return SDValue();
23496  // ...where the inserted data was previously extracted...
23497  if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23498    return SDValue();
23499
23500  SDValue ExtractVec = InsertElt.getOperand(0);
23501  SDValue ExtractIdx = InsertElt.getOperand(1);
23502
23503  // ...from the first element of a vector.
23504  if (!isNullConstant(ExtractIdx))
23505    return SDValue();
23506
23507  // If we get here we are effectively trying to zero lanes 1-N of a vector.
23508
23509  // Ensure there's no type conversion going on.
23510  if (N->getValueType(0) != ExtractVec.getValueType())
23511    return SDValue();
23512
23513  if (!isLanes1toNKnownZero(ExtractVec))
23514    return SDValue();
23515
23516  // The explicit zeroing is redundant.
23517  return ExtractVec;
23518}
23519
23520static SDValue
23521performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
23522  if (SDValue Res = removeRedundantInsertVectorElt(N))
23523    return Res;
23524
23525  return performPostLD1Combine(N, DCI, true);
23526}
23527
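// Rewrite a floating-point VECTOR_SPLICE to operate on a packed integer
// container type: bitcast the operands to integers, extend them to the packed
// SVE type, splice there, then truncate and bitcast the result back.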
23528static SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
23529  EVT Ty = N->getValueType(0);
23530  if (Ty.isInteger())
23531    return SDValue();
23532
23533  EVT IntTy = Ty.changeVectorElementTypeToInteger();
23534  EVT ExtIntTy = getPackedSVEVectorVT(IntTy.getVectorElementCount());
23535  if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
23536      IntTy.getVectorElementType().getScalarSizeInBits())
23537    return SDValue();
23538
23539  SDLoc DL(N);
23540  SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
23541                                     DL, ExtIntTy);
23542  SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
23543                                     DL, ExtIntTy);
23544  SDValue Idx = N->getOperand(2);
23545  SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx);
23546  SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
23547  return DAG.getBitcast(Ty, Trunc);
23548}
23549
23550static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
23551                                      TargetLowering::DAGCombinerInfo &DCI,
23552                                      const AArch64Subtarget *Subtarget) {
23553  SDValue N0 = N->getOperand(0);
23554  EVT VT = N->getValueType(0);
23555
23556  // Don't fold if our only user is an fp_round; the fp_round(fp_extend) pair is folded instead.
23557  if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
23558    return SDValue();
23559
23560  auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
23561    EVT EltVT = VT.getVectorElementType();
23562    return EltVT == MVT::f32 || EltVT == MVT::f64;
23563  };
23564
23565  // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
23566  // We purposefully don't care about legality of the nodes here as we know
23567  // they can be split down into something legal.
23568  if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
23569      N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
23570      VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
23571      VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
23572    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
23573    SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
23574                                     LN0->getChain(), LN0->getBasePtr(),
23575                                     N0.getValueType(), LN0->getMemOperand());
23576    DCI.CombineTo(N, ExtLoad);
23577    DCI.CombineTo(
23578        N0.getNode(),
23579        DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
23580                    DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
23581        ExtLoad.getValue(1));
23582    return SDValue(N, 0); // Return N so it doesn't get rechecked!
23583  }
23584
23585  return SDValue();
23586}
23587
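// Expand AArch64ISD::BSP for plain SVE as
//   (or (and Mask, In1), (and (not Mask), In2));
// NEON, SVE2 and SME subtargets keep the BSP node untouched.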
23588static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
23589                                      const AArch64Subtarget *Subtarget) {
23590  EVT VT = N->getValueType(0);
23591
23592  // Don't expand for NEON, SVE2 or SME
23593  if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
23594    return SDValue();
23595
23596  SDLoc DL(N);
23597
23598  SDValue Mask = N->getOperand(0);
23599  SDValue In1 = N->getOperand(1);
23600  SDValue In2 = N->getOperand(2);
23601
23602  SDValue InvMask = DAG.getNOT(DL, Mask, VT);
23603  SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
23604  SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
23605  return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
23606}
23607
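// Fold duplane128(insert_subvector(undef, bitcast(X), 0)) into
// bitcast(duplane128(insert_subvector(undef, X, 0))) so the DUPLANE128
// operates on X's original 128-bit element type.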
23608static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
23609  EVT VT = N->getValueType(0);
23610
23611  SDValue Insert = N->getOperand(0);
23612  if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
23613    return SDValue();
23614
23615  if (!Insert.getOperand(0).isUndef())
23616    return SDValue();
23617
23618  uint64_t IdxInsert = Insert.getConstantOperandVal(2);
23619  uint64_t IdxDupLane = N->getConstantOperandVal(1);
23620  if (IdxInsert != 0 || IdxDupLane != 0)
23621    return SDValue();
23622
23623  SDValue Bitcast = Insert.getOperand(1);
23624  if (Bitcast.getOpcode() != ISD::BITCAST)
23625    return SDValue();
23626
23627  SDValue Subvec = Bitcast.getOperand(0);
23628  EVT SubvecVT = Subvec.getValueType();
23629  if (!SubvecVT.is128BitVector())
23630    return SDValue();
23631  EVT NewSubvecVT =
23632      getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());
23633
23634  SDLoc DL(N);
23635  SDValue NewInsert =
23636      DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
23637                  DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
23638  SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
23639                                      NewInsert, N->getOperand(1));
23640  return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
23641}
23642
23643// Try to combine mull with uzp1.
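// When a mull of the high half of a vector and a truncated value has a
// matching mull of the low half, the two truncates can be replaced by a single
// UZP1 of their wide operands plus extracts of its low and high halves.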
23644static SDValue tryCombineMULLWithUZP1(SDNode *N,
23645                                      TargetLowering::DAGCombinerInfo &DCI,
23646                                      SelectionDAG &DAG) {
23647  if (DCI.isBeforeLegalizeOps())
23648    return SDValue();
23649
23650  SDValue LHS = N->getOperand(0);
23651  SDValue RHS = N->getOperand(1);
23652
23653  SDValue ExtractHigh;
23654  SDValue ExtractLow;
23655  SDValue TruncHigh;
23656  SDValue TruncLow;
23657  SDLoc DL(N);
23658
23659  // Check the operands are trunc and extract_high.
23660  if (isEssentiallyExtractHighSubvector(LHS) &&
23661      RHS.getOpcode() == ISD::TRUNCATE) {
23662    TruncHigh = RHS;
23663    if (LHS.getOpcode() == ISD::BITCAST)
23664      ExtractHigh = LHS.getOperand(0);
23665    else
23666      ExtractHigh = LHS;
23667  } else if (isEssentiallyExtractHighSubvector(RHS) &&
23668             LHS.getOpcode() == ISD::TRUNCATE) {
23669    TruncHigh = LHS;
23670    if (RHS.getOpcode() == ISD::BITCAST)
23671      ExtractHigh = RHS.getOperand(0);
23672    else
23673      ExtractHigh = RHS;
23674  } else
23675    return SDValue();
23676
23677  // If the truncate's operand is a DUP or another splat value, do not combine
23678  // the op with uzp1; doing so causes regressions in
23679  // test/CodeGen/AArch64/aarch64-smull.ll.
23680  SDValue TruncHighOp = TruncHigh.getOperand(0);
23681  EVT TruncHighOpVT = TruncHighOp.getValueType();
23682  if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
23683      DAG.isSplatValue(TruncHighOp, false))
23684    return SDValue();
23685
23686  // Check that there is another extract_subvector using the same source
23687  // vector, i.e. the extract of the low half. For example,
23688  //
23689  //    t18: v4i16 = extract_subvector t2, Constant:i64<0>
23690  //    t12: v4i16 = truncate t11
23691  //  t31: v4i32 = AArch64ISD::SMULL t18, t12
23692  //    t23: v4i16 = extract_subvector t2, Constant:i64<4>
23693  //    t16: v4i16 = truncate t15
23694  //  t30: v4i32 = AArch64ISD::SMULL t23, t16
23695  //
23696  // This dagcombine assumes the two extract_subvectors use the same source
23697  // vector in order to detect the pair of mulls. If they use different source
23698  // vectors, this combine does not apply.
23699  bool HasFoundMULLow = true;
23700  SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
23701  if (ExtractHighSrcVec->use_size() != 2)
23702    HasFoundMULLow = false;
23703
23704  // Find ExtractLow.
23705  for (SDNode *User : ExtractHighSrcVec.getNode()->uses()) {
23706    if (User == ExtractHigh.getNode())
23707      continue;
23708
23709    if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
23710        !isNullConstant(User->getOperand(1))) {
23711      HasFoundMULLow = false;
23712      break;
23713    }
23714
23715    ExtractLow.setNode(User);
23716  }
23717
23718  if (!ExtractLow || !ExtractLow->hasOneUse())
23719    HasFoundMULLow = false;
23720
23721  // Check ExtractLow's user.
23722  if (HasFoundMULLow) {
23723    SDNode *ExtractLowUser = *ExtractLow.getNode()->use_begin();
23724    if (ExtractLowUser->getOpcode() != N->getOpcode()) {
23725      HasFoundMULLow = false;
23726    } else {
23727      if (ExtractLowUser->getOperand(0) == ExtractLow) {
23728        if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
23729          TruncLow = ExtractLowUser->getOperand(1);
23730        else
23731          HasFoundMULLow = false;
23732      } else {
23733        if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
23734          TruncLow = ExtractLowUser->getOperand(0);
23735        else
23736          HasFoundMULLow = false;
23737      }
23738    }
23739  }
23740
23741  // If the truncate's operand is a DUP or another splat value, do not combine
23742  // the op with uzp1; doing so causes regressions in
23743  // test/CodeGen/AArch64/aarch64-smull.ll.
23744  EVT TruncHighVT = TruncHigh.getValueType();
23745  EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
23746  SDValue TruncLowOp =
23747      HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
23748  EVT TruncLowOpVT = TruncLowOp.getValueType();
23749  if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
23750                         DAG.isSplatValue(TruncLowOp, false)))
23751    return SDValue();
23752
23753  // Create uzp1, extract_high and extract_low.
23754  if (TruncHighOpVT != UZP1VT)
23755    TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
23756  if (TruncLowOpVT != UZP1VT)
23757    TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
23758
23759  SDValue UZP1 =
23760      DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
23761  SDValue HighIdxCst =
23762      DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
23763  SDValue NewTruncHigh =
23764      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
23765  DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
23766
23767  if (HasFoundMULLow) {
23768    EVT TruncLowVT = TruncLow.getValueType();
23769    SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
23770                                      UZP1, ExtractLow.getOperand(1));
23771    DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
23772  }
23773
23774  return SDValue(N, 0);
23775}
23776
23777static SDValue performMULLCombine(SDNode *N,
23778                                  TargetLowering::DAGCombinerInfo &DCI,
23779                                  SelectionDAG &DAG) {
23780  if (SDValue Val =
23781          tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG))
23782    return Val;
23783
23784  if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
23785    return Val;
23786
23787  return SDValue();
23788}
23789
23790static SDValue
23791performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
23792                             SelectionDAG &DAG) {
23793  // Perform the following transform:
23794  //
23795  //         t34: v4i32 = AArch64ISD::UADDLV t2
23796  //       t35: i32 = extract_vector_elt t34, Constant:i64<0>
23797  //     t7: i64 = zero_extend t35
23798  //   t20: v1i64 = scalar_to_vector t7
23799  // ==>
23800  //      t34: v4i32 = AArch64ISD::UADDLV t2
23801  //    t39: v2i32 = extract_subvector t34, Constant:i64<0>
23802  //  t40: v1i64 = AArch64ISD::NVCAST t39
23803  if (DCI.isBeforeLegalizeOps())
23804    return SDValue();
23805
23806  EVT VT = N->getValueType(0);
23807  if (VT != MVT::v1i64)
23808    return SDValue();
23809
23810  SDValue ZEXT = N->getOperand(0);
23811  if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
23812    return SDValue();
23813
23814  SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
23815  if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23816      EXTRACT_VEC_ELT.getValueType() != MVT::i32)
23817    return SDValue();
23818
23819  if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
23820    return SDValue();
23821
23822  SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
23823  if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
23824      UADDLV.getValueType() != MVT::v4i32 ||
23825      UADDLV.getOperand(0).getValueType() != MVT::v8i8)
23826    return SDValue();
23827
23828  // Generate the new sequence with AArch64ISD::NVCAST.
23829  SDLoc DL(N);
23830  SDValue EXTRACT_SUBVEC =
23831      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
23832                  DAG.getConstant(0, DL, MVT::i64));
23833  SDValue NVCAST =
23834      DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
23835
23836  return NVCAST;
23837}
23838
23839SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
23840                                                 DAGCombinerInfo &DCI) const {
23841  SelectionDAG &DAG = DCI.DAG;
23842  switch (N->getOpcode()) {
23843  default:
23844    LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
23845    break;
23846  case ISD::VECREDUCE_AND:
23847  case ISD::VECREDUCE_OR:
23848  case ISD::VECREDUCE_XOR:
23849    return performVecReduceBitwiseCombine(N, DCI, DAG);
23850  case ISD::ADD:
23851  case ISD::SUB:
23852    return performAddSubCombine(N, DCI);
23853  case ISD::BUILD_VECTOR:
23854    return performBuildVectorCombine(N, DCI, DAG);
23855  case ISD::TRUNCATE:
23856    return performTruncateCombine(N, DAG);
23857  case AArch64ISD::ANDS:
23858    return performFlagSettingCombine(N, DCI, ISD::AND);
23859  case AArch64ISD::ADC:
23860    if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
23861      return R;
23862    return foldADCToCINC(N, DAG);
23863  case AArch64ISD::SBC:
23864    return foldOverflowCheck(N, DAG, /* IsAdd */ false);
23865  case AArch64ISD::ADCS:
23866    if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
23867      return R;
23868    return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
23869  case AArch64ISD::SBCS:
23870    if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
23871      return R;
23872    return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
23873  case ISD::XOR:
23874    return performXorCombine(N, DAG, DCI, Subtarget);
23875  case ISD::MUL:
23876    return performMulCombine(N, DAG, DCI, Subtarget);
23877  case ISD::SINT_TO_FP:
23878  case ISD::UINT_TO_FP:
23879    return performIntToFpCombine(N, DAG, Subtarget);
23880  case ISD::FP_TO_SINT:
23881  case ISD::FP_TO_UINT:
23882  case ISD::FP_TO_SINT_SAT:
23883  case ISD::FP_TO_UINT_SAT:
23884    return performFpToIntCombine(N, DAG, DCI, Subtarget);
23885  case ISD::FDIV:
23886    return performFDivCombine(N, DAG, DCI, Subtarget);
23887  case ISD::OR:
23888    return performORCombine(N, DCI, Subtarget, *this);
23889  case ISD::AND:
23890    return performANDCombine(N, DCI);
23891  case ISD::FADD:
23892    return performFADDCombine(N, DCI);
23893  case ISD::INTRINSIC_WO_CHAIN:
23894    return performIntrinsicCombine(N, DCI, Subtarget);
23895  case ISD::ANY_EXTEND:
23896  case ISD::ZERO_EXTEND:
23897  case ISD::SIGN_EXTEND:
23898    return performExtendCombine(N, DCI, DAG);
23899  case ISD::SIGN_EXTEND_INREG:
23900    return performSignExtendInRegCombine(N, DCI, DAG);
23901  case ISD::CONCAT_VECTORS:
23902    return performConcatVectorsCombine(N, DCI, DAG);
23903  case ISD::EXTRACT_SUBVECTOR:
23904    return performExtractSubvectorCombine(N, DCI, DAG);
23905  case ISD::INSERT_SUBVECTOR:
23906    return performInsertSubvectorCombine(N, DCI, DAG);
23907  case ISD::SELECT:
23908    return performSelectCombine(N, DCI);
23909  case ISD::VSELECT:
23910    return performVSelectCombine(N, DCI.DAG);
23911  case ISD::SETCC:
23912    return performSETCCCombine(N, DCI, DAG);
23913  case ISD::LOAD:
23914    return performLOADCombine(N, DCI, DAG, Subtarget);
23915  case ISD::STORE:
23916    return performSTORECombine(N, DCI, DAG, Subtarget);
23917  case ISD::MSTORE:
23918    return performMSTORECombine(N, DCI, DAG, Subtarget);
23919  case ISD::MGATHER:
23920  case ISD::MSCATTER:
23921    return performMaskedGatherScatterCombine(N, DCI, DAG);
23922  case ISD::VECTOR_SPLICE:
23923    return performSVESpliceCombine(N, DAG);
23924  case ISD::FP_EXTEND:
23925    return performFPExtendCombine(N, DAG, DCI, Subtarget);
23926  case AArch64ISD::BRCOND:
23927    return performBRCONDCombine(N, DCI, DAG);
23928  case AArch64ISD::TBNZ:
23929  case AArch64ISD::TBZ:
23930    return performTBZCombine(N, DCI, DAG);
23931  case AArch64ISD::CSEL:
23932    return performCSELCombine(N, DCI, DAG);
23933  case AArch64ISD::DUP:
23934  case AArch64ISD::DUPLANE8:
23935  case AArch64ISD::DUPLANE16:
23936  case AArch64ISD::DUPLANE32:
23937  case AArch64ISD::DUPLANE64:
23938    return performDUPCombine(N, DCI);
23939  case AArch64ISD::DUPLANE128:
23940    return performDupLane128Combine(N, DAG);
23941  case AArch64ISD::NVCAST:
23942    return performNVCASTCombine(N);
23943  case AArch64ISD::SPLICE:
23944    return performSpliceCombine(N, DAG);
23945  case AArch64ISD::UUNPKLO:
23946  case AArch64ISD::UUNPKHI:
23947    return performUnpackCombine(N, DAG, Subtarget);
23948  case AArch64ISD::UZP1:
23949    return performUzpCombine(N, DAG, Subtarget);
23950  case AArch64ISD::SETCC_MERGE_ZERO:
23951    return performSetccMergeZeroCombine(N, DCI);
23952  case AArch64ISD::REINTERPRET_CAST:
23953    return performReinterpretCastCombine(N);
23954  case AArch64ISD::GLD1_MERGE_ZERO:
23955  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
23956  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
23957  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
23958  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
23959  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
23960  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
23961  case AArch64ISD::GLD1S_MERGE_ZERO:
23962  case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
23963  case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
23964  case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
23965  case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
23966  case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
23967  case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
23968    return performGLD1Combine(N, DAG);
23969  case AArch64ISD::VASHR:
23970  case AArch64ISD::VLSHR:
23971    return performVectorShiftCombine(N, *this, DCI);
23972  case AArch64ISD::SUNPKLO:
23973    return performSunpkloCombine(N, DAG);
23974  case AArch64ISD::BSP:
23975    return performBSPExpandForSVE(N, DAG, Subtarget);
23976  case ISD::INSERT_VECTOR_ELT:
23977    return performInsertVectorEltCombine(N, DCI);
23978  case ISD::EXTRACT_VECTOR_ELT:
23979    return performExtractVectorEltCombine(N, DCI, Subtarget);
23980  case ISD::VECREDUCE_ADD:
23981    return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
23982  case AArch64ISD::UADDV:
23983    return performUADDVCombine(N, DAG);
23984  case AArch64ISD::SMULL:
23985  case AArch64ISD::UMULL:
23986  case AArch64ISD::PMULL:
23987    return performMULLCombine(N, DCI, DAG);
23988  case ISD::INTRINSIC_VOID:
23989  case ISD::INTRINSIC_W_CHAIN:
23990    switch (N->getConstantOperandVal(1)) {
23991    case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
23992      return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
23993    case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
23994      return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
23995    case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
23996      return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
23997    case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
23998      return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
23999    case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
24000    case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
24001    case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
24002    case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
24003    case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
24004    case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
24005    case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
24006    case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
24007      return legalizeSVEGatherPrefetchOffsVec(N, DAG);
24008    case Intrinsic::aarch64_neon_ld2:
24009    case Intrinsic::aarch64_neon_ld3:
24010    case Intrinsic::aarch64_neon_ld4:
24011    case Intrinsic::aarch64_neon_ld1x2:
24012    case Intrinsic::aarch64_neon_ld1x3:
24013    case Intrinsic::aarch64_neon_ld1x4:
24014    case Intrinsic::aarch64_neon_ld2lane:
24015    case Intrinsic::aarch64_neon_ld3lane:
24016    case Intrinsic::aarch64_neon_ld4lane:
24017    case Intrinsic::aarch64_neon_ld2r:
24018    case Intrinsic::aarch64_neon_ld3r:
24019    case Intrinsic::aarch64_neon_ld4r:
24020    case Intrinsic::aarch64_neon_st2:
24021    case Intrinsic::aarch64_neon_st3:
24022    case Intrinsic::aarch64_neon_st4:
24023    case Intrinsic::aarch64_neon_st1x2:
24024    case Intrinsic::aarch64_neon_st1x3:
24025    case Intrinsic::aarch64_neon_st1x4:
24026    case Intrinsic::aarch64_neon_st2lane:
24027    case Intrinsic::aarch64_neon_st3lane:
24028    case Intrinsic::aarch64_neon_st4lane:
24029      return performNEONPostLDSTCombine(N, DCI, DAG);
24030    case Intrinsic::aarch64_sve_ldnt1:
24031      return performLDNT1Combine(N, DAG);
24032    case Intrinsic::aarch64_sve_ld1rq:
24033      return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
24034    case Intrinsic::aarch64_sve_ld1ro:
24035      return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
24036    case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
24037      return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
24038    case Intrinsic::aarch64_sve_ldnt1_gather:
24039      return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
24040    case Intrinsic::aarch64_sve_ldnt1_gather_index:
24041      return performGatherLoadCombine(N, DAG,
24042                                      AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
24043    case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
24044      return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
24045    case Intrinsic::aarch64_sve_ld1:
24046      return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
24047    case Intrinsic::aarch64_sve_ldnf1:
24048      return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
24049    case Intrinsic::aarch64_sve_ldff1:
24050      return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
24051    case Intrinsic::aarch64_sve_st1:
24052      return performST1Combine(N, DAG);
24053    case Intrinsic::aarch64_sve_stnt1:
24054      return performSTNT1Combine(N, DAG);
24055    case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
24056      return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
24057    case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
24058      return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
24059    case Intrinsic::aarch64_sve_stnt1_scatter:
24060      return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
24061    case Intrinsic::aarch64_sve_stnt1_scatter_index:
24062      return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
24063    case Intrinsic::aarch64_sve_ld1_gather:
24064      return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
24065    case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
24066    case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
24067      return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_MERGE_ZERO);
24068    case Intrinsic::aarch64_sve_ld1q_gather_index:
24069      return performGatherLoadCombine(N, DAG,
24070                                      AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
24071    case Intrinsic::aarch64_sve_ld1_gather_index:
24072      return performGatherLoadCombine(N, DAG,
24073                                      AArch64ISD::GLD1_SCALED_MERGE_ZERO);
24074    case Intrinsic::aarch64_sve_ld1_gather_sxtw:
24075      return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
24076                                      /*OnlyPackedOffsets=*/false);
24077    case Intrinsic::aarch64_sve_ld1_gather_uxtw:
24078      return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
24079                                      /*OnlyPackedOffsets=*/false);
24080    case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
24081      return performGatherLoadCombine(N, DAG,
24082                                      AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
24083                                      /*OnlyPackedOffsets=*/false);
24084    case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
24085      return performGatherLoadCombine(N, DAG,
24086                                      AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
24087                                      /*OnlyPackedOffsets=*/false);
24088    case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
24089      return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
24090    case Intrinsic::aarch64_sve_ldff1_gather:
24091      return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
24092    case Intrinsic::aarch64_sve_ldff1_gather_index:
24093      return performGatherLoadCombine(N, DAG,
24094                                      AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
24095    case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
24096      return performGatherLoadCombine(N, DAG,
24097                                      AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
24098                                      /*OnlyPackedOffsets=*/false);
24099    case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
24100      return performGatherLoadCombine(N, DAG,
24101                                      AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
24102                                      /*OnlyPackedOffsets=*/false);
24103    case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
24104      return performGatherLoadCombine(N, DAG,
24105                                      AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
24106                                      /*OnlyPackedOffsets=*/false);
24107    case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
24108      return performGatherLoadCombine(N, DAG,
24109                                      AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
24110                                      /*OnlyPackedOffsets=*/false);
24111    case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
24112      return performGatherLoadCombine(N, DAG,
24113                                      AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
24114    case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
24115    case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
24116      return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_PRED);
24117    case Intrinsic::aarch64_sve_st1q_scatter_index:
24118      return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_INDEX_PRED);
24119    case Intrinsic::aarch64_sve_st1_scatter:
24120      return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
24121    case Intrinsic::aarch64_sve_st1_scatter_index:
24122      return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
24123    case Intrinsic::aarch64_sve_st1_scatter_sxtw:
24124      return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
24125                                        /*OnlyPackedOffsets=*/false);
24126    case Intrinsic::aarch64_sve_st1_scatter_uxtw:
24127      return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
24128                                        /*OnlyPackedOffsets=*/false);
24129    case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
24130      return performScatterStoreCombine(N, DAG,
24131                                        AArch64ISD::SST1_SXTW_SCALED_PRED,
24132                                        /*OnlyPackedOffsets=*/false);
24133    case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
24134      return performScatterStoreCombine(N, DAG,
24135                                        AArch64ISD::SST1_UXTW_SCALED_PRED,
24136                                        /*OnlyPackedOffsets=*/false);
24137    case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
24138      return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
24139    case Intrinsic::aarch64_rndr:
24140    case Intrinsic::aarch64_rndrrs: {
24141      unsigned IntrinsicID = N->getConstantOperandVal(1);
24142      auto Register =
24143          (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
24144                                                  : AArch64SysReg::RNDRRS);
24145      SDLoc DL(N);
24146      SDValue A = DAG.getNode(
24147          AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
24148          N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
24149      SDValue B = DAG.getNode(
24150          AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
24151          DAG.getConstant(0, DL, MVT::i32),
24152          DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
24153      return DAG.getMergeValues(
24154          {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
24155    }
24156    case Intrinsic::aarch64_sme_ldr_zt:
24157      return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
24158                         DAG.getVTList(MVT::Other), N->getOperand(0),
24159                         N->getOperand(2), N->getOperand(3));
24160    case Intrinsic::aarch64_sme_str_zt:
24161      return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
24162                         DAG.getVTList(MVT::Other), N->getOperand(0),
24163                         N->getOperand(2), N->getOperand(3));
24164    default:
24165      break;
24166    }
24167    break;
24168  case ISD::GlobalAddress:
24169    return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
24170  case ISD::CTLZ:
24171    return performCTLZCombine(N, DAG, Subtarget);
24172  case ISD::SCALAR_TO_VECTOR:
24173    return performScalarToVectorCombine(N, DCI, DAG);
24174  }
24175  return SDValue();
24176}
24177
24178// Check if the return value is used only as a return value, as otherwise
24179// we can't perform a tail call. In particular, we need to check for
24180// target ISD nodes that are returns and any other "odd" constructs
24181// that the generic analysis code won't necessarily catch.
24182bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
24183                                               SDValue &Chain) const {
24184  if (N->getNumValues() != 1)
24185    return false;
24186  if (!N->hasNUsesOfValue(1, 0))
24187    return false;
24188
24189  SDValue TCChain = Chain;
24190  SDNode *Copy = *N->use_begin();
24191  if (Copy->getOpcode() == ISD::CopyToReg) {
24192    // If the copy has a glue operand, we conservatively assume it isn't safe to
24193    // perform a tail call.
24194    if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
24195        MVT::Glue)
24196      return false;
24197    TCChain = Copy->getOperand(0);
24198  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
24199    return false;
24200
24201  bool HasRet = false;
24202  for (SDNode *Node : Copy->uses()) {
24203    if (Node->getOpcode() != AArch64ISD::RET_GLUE)
24204      return false;
24205    HasRet = true;
24206  }
24207
24208  if (!HasRet)
24209    return false;
24210
24211  Chain = TCChain;
24212  return true;
24213}
24214
24215// Return whether an instruction can potentially be optimized to a tail
24216// call. This will cause the optimizers to attempt to move, or duplicate,
24217// return instructions to help enable tail call optimizations for this
24218// instruction.
24219bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
24220  return CI->isTailCall();
24221}
24222
24223bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
24224                                            Register Offset, bool IsPre,
24225                                            MachineRegisterInfo &MRI) const {
24226  auto CstOffset = getIConstantVRegVal(Offset, MRI);
24227  if (!CstOffset || CstOffset->isZero())
24228    return false;
24229
24230  // All of the indexed addressing mode instructions take a signed 9 bit
24231  // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
24232  // encodes the sign/indexing direction.
24233  return isInt<9>(CstOffset->getSExtValue());
24234}
24235
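// Shared helper for pre/post-indexed address matching: accept Op only when it
// is an ADD or SUB of the pointer by a signed 9-bit constant, returning the
// Base/Offset pair; loads whose only value use is a scalable-vector splat are
// rejected in favour of replicating loads (ld1r*).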
24236bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
24237                                                   SDValue &Base,
24238                                                   SDValue &Offset,
24239                                                   SelectionDAG &DAG) const {
24240  if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
24241    return false;
24242
24243  // Non-null if there is exactly one user of the loaded value (ignoring chain).
24244  SDNode *ValOnlyUser = nullptr;
24245  for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
24246       ++UI) {
24247    if (UI.getUse().getResNo() == 1)
24248      continue; // Ignore chain.
24249    if (ValOnlyUser == nullptr)
24250      ValOnlyUser = *UI;
24251    else {
24252      ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
24253      break;
24254    }
24255  }
24256
24257  auto IsUndefOrZero = [](SDValue V) {
24258    return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
24259  };
24260
24261  // If the only user of the value is a scalable vector splat, it is
24262  // preferable to do a replicating load (ld1r*).
24263  if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
24264      (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
24265       (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
24266        IsUndefOrZero(ValOnlyUser->getOperand(2)))))
24267    return false;
24268
24269  Base = Op->getOperand(0);
24270  // All of the indexed addressing mode instructions take a signed
24271  // 9 bit immediate offset.
24272  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
24273    int64_t RHSC = RHS->getSExtValue();
24274    if (Op->getOpcode() == ISD::SUB)
24275      RHSC = -(uint64_t)RHSC;
24276    if (!isInt<9>(RHSC))
24277      return false;
24278    // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
24279    // when dealing with subtraction.
24280    Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
24281    return true;
24282  }
24283  return false;
24284}
24285
24286bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
24287                                                      SDValue &Offset,
24288                                                      ISD::MemIndexedMode &AM,
24289                                                      SelectionDAG &DAG) const {
24290  EVT VT;
24291  SDValue Ptr;
24292  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
24293    VT = LD->getMemoryVT();
24294    Ptr = LD->getBasePtr();
24295  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
24296    VT = ST->getMemoryVT();
24297    Ptr = ST->getBasePtr();
24298  } else
24299    return false;
24300
24301  if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
24302    return false;
24303  AM = ISD::PRE_INC;
24304  return true;
24305}
24306
24307bool AArch64TargetLowering::getPostIndexedAddressParts(
24308    SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
24309    ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
24310  EVT VT;
24311  SDValue Ptr;
24312  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
24313    VT = LD->getMemoryVT();
24314    Ptr = LD->getBasePtr();
24315  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
24316    VT = ST->getMemoryVT();
24317    Ptr = ST->getBasePtr();
24318  } else
24319    return false;
24320
24321  if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
24322    return false;
24323  // Post-indexing updates the base, so it's not a valid transform
24324  // if that's not the same as the load's pointer.
24325  if (Ptr != Base)
24326    return false;
24327  AM = ISD::POST_INC;
24328  return true;
24329}
24330
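// Replace a bitcast from an i1 vector with a scalar result by materialising
// the predicate as a bitmask and zero-extending or truncating it to the
// requested width.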
24331static void replaceBoolVectorBitcast(SDNode *N,
24332                                     SmallVectorImpl<SDValue> &Results,
24333                                     SelectionDAG &DAG) {
24334  SDLoc DL(N);
24335  SDValue Op = N->getOperand(0);
24336  EVT VT = N->getValueType(0);
24337  [[maybe_unused]] EVT SrcVT = Op.getValueType();
24338  assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
24339         "Must be bool vector.");
24340
24341  // Special handling for Clang's __builtin_convertvector. For vectors with <8
24342  // elements, it adds a vector concatenation with undef(s). If we encounter
24343  // this here, we can skip the concat.
24344  if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
24345    bool AllUndef = true;
24346    for (unsigned I = 1; I < Op.getNumOperands(); ++I)
24347      AllUndef &= Op.getOperand(I).isUndef();
24348
24349    if (AllUndef)
24350      Op = Op.getOperand(0);
24351  }
24352
24353  SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
24354  if (VectorBits)
24355    Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
24356}
24357
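// Bitcasts to small illegal vector types (e.g. i32 -> v2i16) are lowered by
// moving the scalar into lane 0 of a wider legal vector, bitcasting that, and
// extracting the subvector that holds lane 0.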
24358static void CustomNonLegalBITCASTResults(SDNode *N,
24359                                         SmallVectorImpl<SDValue> &Results,
24360                                         SelectionDAG &DAG, EVT ExtendVT,
24361                                         EVT CastVT) {
24362  SDLoc DL(N);
24363  SDValue Op = N->getOperand(0);
24364  EVT VT = N->getValueType(0);
24365
24366  // Use SCALAR_TO_VECTOR for lane zero
24367  SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
24368  SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
24369  SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
24370  Results.push_back(
24371      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
24372}
24373
24374void AArch64TargetLowering::ReplaceBITCASTResults(
24375    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
24376  SDLoc DL(N);
24377  SDValue Op = N->getOperand(0);
24378  EVT VT = N->getValueType(0);
24379  EVT SrcVT = Op.getValueType();
24380
24381  if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
24382    CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
24383    return;
24384  }
24385
24386  if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
24387    CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
24388    return;
24389  }
24390
24391  if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
24392    CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
24393    return;
24394  }
24395
24396  if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
24397    assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
24398           "Expected fp->int bitcast!");
24399
24400    // Bitcasting between unpacked vector types of different element counts is
24401    // not a NOP because the live elements are laid out differently.
24402    //                01234567
24403    // e.g. nxv2i32 = XX??XX??
24404    //      nxv4f16 = X?X?X?X?
24405    if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
24406      return;
24407
24408    SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
24409    Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
24410    return;
24411  }
24412
24413  if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
24414      !VT.isVector())
24415    return replaceBoolVectorBitcast(N, Results, DAG);
24416
24417  if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
24418    return;
24419
24420  Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
24421                                 DAG.getUNDEF(MVT::i32), Op);
24422  Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
24423  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
24424}
24425
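// Replace add(X, shuffle(X, <1,0,3,2,...>)) of a 256-bit vector with an ADDP
// of the two split halves of X, then shuffle each pairwise result back into
// both lanes of its original pair. Floating-point adds are only handled when
// reassociation is allowed.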
24426static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
24427                               SelectionDAG &DAG,
24428                               const AArch64Subtarget *Subtarget) {
24429  EVT VT = N->getValueType(0);
24430  if (!VT.is256BitVector() ||
24431      (VT.getScalarType().isFloatingPoint() &&
24432       !N->getFlags().hasAllowReassociation()) ||
24433      (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()))
24434    return;
24435
24436  SDValue X = N->getOperand(0);
24437  auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
24438  if (!Shuf) {
24439    Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
24440    X = N->getOperand(1);
24441    if (!Shuf)
24442      return;
24443  }
24444
24445  if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
24446    return;
24447
24448  // Check the mask is 1,0,3,2,5,4,...
24449  ArrayRef<int> Mask = Shuf->getMask();
24450  for (int I = 0, E = Mask.size(); I < E; I++)
24451    if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
24452      return;
24453
24454  SDLoc DL(N);
24455  auto LoHi = DAG.SplitVector(X, DL);
24456  assert(LoHi.first.getValueType() == LoHi.second.getValueType());
24457  SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
24458                             LoHi.first, LoHi.second);
24459
24460  // Shuffle the elements back into order.
24461  SmallVector<int> NMask;
24462  for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
24463    NMask.push_back(I);
24464    NMask.push_back(I);
24465  }
24466  Results.push_back(
24467      DAG.getVectorShuffle(VT, DL,
24468                           DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
24469                                       DAG.getUNDEF(LoHi.first.getValueType())),
24470                           DAG.getUNDEF(VT), NMask));
24471}
24472
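// Split an across-vector reduction of a wide vector: combine the two halves
// with the binary InterOp and finish with the across-lanes AcrossOp reduction.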
24473static void ReplaceReductionResults(SDNode *N,
24474                                    SmallVectorImpl<SDValue> &Results,
24475                                    SelectionDAG &DAG, unsigned InterOp,
24476                                    unsigned AcrossOp) {
24477  EVT LoVT, HiVT;
24478  SDValue Lo, Hi;
24479  SDLoc dl(N);
24480  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
24481  std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
24482  SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
24483  SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
24484  Results.push_back(SplitVal);
24485}
24486
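// An EXTRACT_SUBVECTOR that takes the low or high half of a scalable integer
// vector is replaced with UUNPKLO/UUNPKHI of the source followed by a truncate
// back to the requested element type.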
24487void AArch64TargetLowering::ReplaceExtractSubVectorResults(
24488    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
24489  SDValue In = N->getOperand(0);
24490  EVT InVT = In.getValueType();
24491
24492  // Common code will handle these just fine.
24493  if (!InVT.isScalableVector() || !InVT.isInteger())
24494    return;
24495
24496  SDLoc DL(N);
24497  EVT VT = N->getValueType(0);
24498
24499  // The following checks bail if this is not a halving operation.
24500
24501  ElementCount ResEC = VT.getVectorElementCount();
24502
24503  if (InVT.getVectorElementCount() != (ResEC * 2))
24504    return;
24505
24506  auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
24507  if (!CIndex)
24508    return;
24509
24510  unsigned Index = CIndex->getZExtValue();
24511  if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
24512    return;
24513
24514  unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
24515  EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
24516
24517  SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
24518  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
24519}
24520
24521// Create an even/odd pair of X registers holding integer value V.
24522static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
24523  SDLoc dl(V.getNode());
24524  auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i64, MVT::i64);
24525  if (DAG.getDataLayout().isBigEndian())
24526    std::swap(VLo, VHi);
24527  SDValue RegClass =
24528      DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
24529  SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
24530  SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
24531  const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
24532  return SDValue(
24533      DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
24534}
24535
24536static void ReplaceCMP_SWAP_128Results(SDNode *N,
24537                                       SmallVectorImpl<SDValue> &Results,
24538                                       SelectionDAG &DAG,
24539                                       const AArch64Subtarget *Subtarget) {
24540  assert(N->getValueType(0) == MVT::i128 &&
24541         "AtomicCmpSwap on types less than 128 should be legal");
24542
24543  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
24544  if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
24545    // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
24546    // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
24547    SDValue Ops[] = {
24548        createGPRPairNode(DAG, N->getOperand(2)), // Compare value
24549        createGPRPairNode(DAG, N->getOperand(3)), // Store value
24550        N->getOperand(1), // Ptr
24551        N->getOperand(0), // Chain in
24552    };
24553
24554    unsigned Opcode;
24555    switch (MemOp->getMergedOrdering()) {
24556    case AtomicOrdering::Monotonic:
24557      Opcode = AArch64::CASPX;
24558      break;
24559    case AtomicOrdering::Acquire:
24560      Opcode = AArch64::CASPAX;
24561      break;
24562    case AtomicOrdering::Release:
24563      Opcode = AArch64::CASPLX;
24564      break;
24565    case AtomicOrdering::AcquireRelease:
24566    case AtomicOrdering::SequentiallyConsistent:
24567      Opcode = AArch64::CASPALX;
24568      break;
24569    default:
24570      llvm_unreachable("Unexpected ordering!");
24571    }
24572
24573    MachineSDNode *CmpSwap = DAG.getMachineNode(
24574        Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
24575    DAG.setNodeMemRefs(CmpSwap, {MemOp});
24576
24577    unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
24578    if (DAG.getDataLayout().isBigEndian())
24579      std::swap(SubReg1, SubReg2);
24580    SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
24581                                            SDValue(CmpSwap, 0));
24582    SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
24583                                            SDValue(CmpSwap, 0));
24584    Results.push_back(
24585        DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
24586    Results.push_back(SDValue(CmpSwap, 1)); // Chain out
24587    return;
24588  }
24589
24590  unsigned Opcode;
24591  switch (MemOp->getMergedOrdering()) {
24592  case AtomicOrdering::Monotonic:
24593    Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
24594    break;
24595  case AtomicOrdering::Acquire:
24596    Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
24597    break;
24598  case AtomicOrdering::Release:
24599    Opcode = AArch64::CMP_SWAP_128_RELEASE;
24600    break;
24601  case AtomicOrdering::AcquireRelease:
24602  case AtomicOrdering::SequentiallyConsistent:
24603    Opcode = AArch64::CMP_SWAP_128;
24604    break;
24605  default:
24606    llvm_unreachable("Unexpected ordering!");
24607  }
24608
24609  SDLoc DL(N);
24610  auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
24611  auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
24612  SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
24613                   New.first,        New.second,    N->getOperand(0)};
24614  SDNode *CmpSwap = DAG.getMachineNode(
24615      Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
24616      Ops);
24617  DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
24618
24619  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
24620                                SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
24621  Results.push_back(SDValue(CmpSwap, 3));
24622}
24623
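// Map a 128-bit atomic RMW ISD opcode and its memory ordering to the matching
// LSE128 instruction (the LDCLRP, LDSETP and SWPP families).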
24624static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
24625                                       AtomicOrdering Ordering) {
24626  // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
24627  // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
24628  // the type is not legal. Therefore we shouldn't expect to see a 128-bit
24629  // ATOMIC_LOAD_CLR at any point.
24630  assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
24631         "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
24632  assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
24633  assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
24634
24635  if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
24636    // The operand will need to be XORed in a separate step.
24637    switch (Ordering) {
24638    case AtomicOrdering::Monotonic:
24639      return AArch64::LDCLRP;
24641    case AtomicOrdering::Acquire:
24642      return AArch64::LDCLRPA;
24644    case AtomicOrdering::Release:
24645      return AArch64::LDCLRPL;
24647    case AtomicOrdering::AcquireRelease:
24648    case AtomicOrdering::SequentiallyConsistent:
24649      return AArch64::LDCLRPAL;
24651    default:
24652      llvm_unreachable("Unexpected ordering!");
24653    }
24654  }
24655
24656  if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
24657    switch (Ordering) {
24658    case AtomicOrdering::Monotonic:
24659      return AArch64::LDSETP;
24661    case AtomicOrdering::Acquire:
24662      return AArch64::LDSETPA;
24664    case AtomicOrdering::Release:
24665      return AArch64::LDSETPL;
24667    case AtomicOrdering::AcquireRelease:
24668    case AtomicOrdering::SequentiallyConsistent:
24669      return AArch64::LDSETPAL;
24671    default:
24672      llvm_unreachable("Unexpected ordering!");
24673    }
24674  }
24675
24676  if (ISDOpcode == ISD::ATOMIC_SWAP) {
24677    switch (Ordering) {
24678    case AtomicOrdering::Monotonic:
24679      return AArch64::SWPP;
24681    case AtomicOrdering::Acquire:
24682      return AArch64::SWPPA;
24684    case AtomicOrdering::Release:
24685      return AArch64::SWPPL;
24687    case AtomicOrdering::AcquireRelease:
24688    case AtomicOrdering::SequentiallyConsistent:
24689      return AArch64::SWPPAL;
24691    default:
24692      llvm_unreachable("Unexpected ordering!");
24693    }
24694  }
24695
24696  llvm_unreachable("Unexpected ISDOpcode!");
24697}
24698
24699static void ReplaceATOMIC_LOAD_128Results(SDNode *N,
24700                                          SmallVectorImpl<SDValue> &Results,
24701                                          SelectionDAG &DAG,
24702                                          const AArch64Subtarget *Subtarget) {
24703  // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower it
24704  // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
24705  // rather than the CASP instructions, because CASP has register classes for
24706  // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
24707  // to present them as single operands. LSE128 instructions use the GPR64
24708  // register class (because the pair does not have to be sequential), like
24709  // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
24710
24711  assert(N->getValueType(0) == MVT::i128 &&
24712         "AtomicLoadXXX on types less than 128 should be legal");
24713
24714  if (!Subtarget->hasLSE128())
24715    return;
24716
24717  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
24718  const SDValue &Chain = N->getOperand(0);
24719  const SDValue &Ptr = N->getOperand(1);
24720  const SDValue &Val128 = N->getOperand(2);
24721  std::pair<SDValue, SDValue> Val2x64 =
24722      DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
24723
24724  const unsigned ISDOpcode = N->getOpcode();
24725  const unsigned MachineOpcode =
24726      getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
24727
24728  if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
24729    SDLoc dl(Val128);
24730    Val2x64.first =
24731        DAG.getNode(ISD::XOR, dl, MVT::i64,
24732                    DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first);
24733    Val2x64.second =
24734        DAG.getNode(ISD::XOR, dl, MVT::i64,
24735                    DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second);
24736  }
24737
24738  SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
24739  if (DAG.getDataLayout().isBigEndian())
24740    std::swap(Ops[0], Ops[1]);
24741
24742  MachineSDNode *AtomicInst =
24743      DAG.getMachineNode(MachineOpcode, SDLoc(N),
24744                         DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
24745
24746  DAG.setNodeMemRefs(AtomicInst, {MemOp});
24747
24748  SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
24749  if (DAG.getDataLayout().isBigEndian())
24750    std::swap(Lo, Hi);
24751
24752  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
24753  Results.push_back(SDValue(AtomicInst, 2)); // Chain out
24754}
24755
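// Replace the illegal-typed results of N with legal-typed values, pushed onto
// Results in the same order as the original results.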
24756void AArch64TargetLowering::ReplaceNodeResults(
24757    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
24758  switch (N->getOpcode()) {
24759  default:
24760    llvm_unreachable("Don't know how to custom expand this");
24761  case ISD::BITCAST:
24762    ReplaceBITCASTResults(N, Results, DAG);
24763    return;
24764  case ISD::VECREDUCE_ADD:
24765  case ISD::VECREDUCE_SMAX:
24766  case ISD::VECREDUCE_SMIN:
24767  case ISD::VECREDUCE_UMAX:
24768  case ISD::VECREDUCE_UMIN:
24769    Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
24770    return;
24771  case ISD::ADD:
24772  case ISD::FADD:
24773    ReplaceAddWithADDP(N, Results, DAG, Subtarget);
24774    return;
24775
24776  case ISD::CTPOP:
24777  case ISD::PARITY:
24778    if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
24779      Results.push_back(Result);
24780    return;
24781  case AArch64ISD::SADDV:
24782    ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
24783    return;
24784  case AArch64ISD::UADDV:
24785    ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
24786    return;
24787  case AArch64ISD::SMINV:
24788    ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
24789    return;
24790  case AArch64ISD::UMINV:
24791    ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
24792    return;
24793  case AArch64ISD::SMAXV:
24794    ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
24795    return;
24796  case AArch64ISD::UMAXV:
24797    ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
24798    return;
24799  case ISD::MULHS:
24800    if (useSVEForFixedLengthVectorVT(SDValue(N, 0).getValueType()))
24801      Results.push_back(
24802          LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
24803    return;
24804  case ISD::MULHU:
24805    if (useSVEForFixedLengthVectorVT(SDValue(N, 0).getValueType()))
24806      Results.push_back(
24807          LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
24808    return;
24809  case ISD::FP_TO_UINT:
24810  case ISD::FP_TO_SINT:
24811  case ISD::STRICT_FP_TO_SINT:
24812  case ISD::STRICT_FP_TO_UINT:
24813    assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
24814    // Let normal code take care of it by not adding anything to Results.
24815    return;
24816  case ISD::ATOMIC_CMP_SWAP:
24817    ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
24818    return;
24819  case ISD::ATOMIC_LOAD_CLR:
24820    assert(N->getValueType(0) != MVT::i128 &&
24821           "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
24822    break;
24823  case ISD::ATOMIC_LOAD_AND:
24824  case ISD::ATOMIC_LOAD_OR:
24825  case ISD::ATOMIC_SWAP: {
24826    assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
24827           "Expected 128-bit atomicrmw.");
24828    // These need custom type legalisation, so we select instructions directly.
24829    ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
24830    return;
24831  }
24832  case ISD::ATOMIC_LOAD:
24833  case ISD::LOAD: {
24834    MemSDNode *LoadNode = cast<MemSDNode>(N);
24835    EVT MemVT = LoadNode->getMemoryVT();
24836    // Handle lowering 256-bit non-temporal loads into LDNP for little-endian
24837    // targets.
24838    if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
24839        MemVT.getSizeInBits() == 256u &&
24840        (MemVT.getScalarSizeInBits() == 8u ||
24841         MemVT.getScalarSizeInBits() == 16u ||
24842         MemVT.getScalarSizeInBits() == 32u ||
24843         MemVT.getScalarSizeInBits() == 64u)) {
24844
24845      SDValue Result = DAG.getMemIntrinsicNode(
24846          AArch64ISD::LDNP, SDLoc(N),
24847          DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
24848                         MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
24849                         MVT::Other}),
24850          {LoadNode->getChain(), LoadNode->getBasePtr()},
24851          LoadNode->getMemoryVT(), LoadNode->getMemOperand());
24852
24853      SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
24854                                 Result.getValue(0), Result.getValue(1));
24855      Results.append({Pair, Result.getValue(2) /* Chain */});
24856      return;
24857    }
24858
24859    if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
24860        LoadNode->getMemoryVT() != MVT::i128) {
24861      // Loads that are neither volatile nor atomic, or that are not i128, are
24862      // handled by normal legalisation and AArch64's load/store optimizer.
24863      return;
24864    }
24865
24866    if (SDValue(N, 0).getValueType() == MVT::i128) {
24867      auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
24868      bool isLoadAcquire =
24869          AN && AN->getSuccessOrdering() == AtomicOrdering::Acquire;
24870      unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
24871
24872      if (isLoadAcquire)
24873        assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
24874
24875      SDValue Result = DAG.getMemIntrinsicNode(
24876          Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
24877          {LoadNode->getChain(), LoadNode->getBasePtr()},
24878          LoadNode->getMemoryVT(), LoadNode->getMemOperand());
24879
24880      unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
24881
24882      SDValue Pair =
24883          DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
24884                      Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
24885      Results.append({Pair, Result.getValue(2) /* Chain */});
24886    }
24887    return;
24888  }
24889  case ISD::EXTRACT_SUBVECTOR:
24890    ReplaceExtractSubVectorResults(N, Results, DAG);
24891    return;
24892  case ISD::INSERT_SUBVECTOR:
24893  case ISD::CONCAT_VECTORS:
24894    // Custom lowering has been requested for INSERT_SUBVECTOR and
24895    // CONCAT_VECTORS -- but delegate to common code for result type
24896    // legalisation.
24897    return;
24898  case ISD::INTRINSIC_WO_CHAIN: {
24899    EVT VT = N->getValueType(0);
24900    assert((VT == MVT::i8 || VT == MVT::i16) &&
24901           "custom lowering for unexpected type");
24902
24903    Intrinsic::ID IntID =
24904        static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
24905    switch (IntID) {
24906    default:
24907      return;
24908    case Intrinsic::aarch64_sve_clasta_n: {
24909      SDLoc DL(N);
24910      auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
24911      auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
24912                           N->getOperand(1), Op2, N->getOperand(3));
24913      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
24914      return;
24915    }
24916    case Intrinsic::aarch64_sve_clastb_n: {
24917      SDLoc DL(N);
24918      auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
24919      auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
24920                           N->getOperand(1), Op2, N->getOperand(3));
24921      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
24922      return;
24923    }
24924    case Intrinsic::aarch64_sve_lasta: {
24925      SDLoc DL(N);
24926      auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
24927                           N->getOperand(1), N->getOperand(2));
24928      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
24929      return;
24930    }
24931    case Intrinsic::aarch64_sve_lastb: {
24932      SDLoc DL(N);
24933      auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
24934                           N->getOperand(1), N->getOperand(2));
24935      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
24936      return;
24937    }
24938    }
24939  }
24940  case ISD::READ_REGISTER: {
24941    SDLoc DL(N);
24942    assert(N->getValueType(0) == MVT::i128 &&
24943           "READ_REGISTER custom lowering is only for 128-bit sysregs");
24944    SDValue Chain = N->getOperand(0);
24945    SDValue SysRegName = N->getOperand(1);
24946
24947    SDValue Result = DAG.getNode(
24948        AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
24949        Chain, SysRegName);
24950
24951    // Sysreg reads are not affected by endianness: Result.getValue(0) always
24952    // contains the lower half of the 128-bit System Register value.
24953    SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
24954                               Result.getValue(0), Result.getValue(1));
24955    Results.push_back(Pair);
24956    Results.push_back(Result.getValue(2)); // Chain
24957    return;
24958  }
24959  }
24960}
24961
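// Use the LOAD_STACK_GUARD pseudo for the stack protector, except on Android
// and Fuchsia, which keep the guard in a fixed TLS slot (see getIRStackGuard)
// and so defer to the generic behaviour.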
24962bool AArch64TargetLowering::useLoadStackGuardNode() const {
24963  if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
24964    return TargetLowering::useLoadStackGuardNode();
24965  return true;
24966}
24967
24968unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
24969  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
24970  // reciprocal if there are three or more FDIVs.
24971  return 3;
24972}
24973
24974TargetLoweringBase::LegalizeTypeAction
24975AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
24976  // During type legalization we prefer to widen v1i8, v1i16, v1i32 and v1f32
24977  // to v8i8, v4i16, v2i32 and v2f32 rather than promote them.
24978  if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
24979      VT == MVT::v1f32)
24980    return TypeWidenVector;
24981
24982  return TargetLoweringBase::getPreferredVectorAction(VT);
24983}
24984
24985// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
24986// provided the address is 16-byte aligned.
24987bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
24988  if (!Subtarget->hasLSE2())
24989    return false;
24990
24991  if (auto LI = dyn_cast<LoadInst>(I))
24992    return LI->getType()->getPrimitiveSizeInBits() == 128 &&
24993           LI->getAlign() >= Align(16);
24994
24995  if (auto SI = dyn_cast<StoreInst>(I))
24996    return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
24997           SI->getAlign() >= Align(16);
24998
24999  return false;
25000}
25001
25002bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const {
25003  if (!Subtarget->hasLSE128())
25004    return false;
25005
25006  // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
25007  // clobbers its two data registers with the value loaded from memory.
25008  if (const auto *SI = dyn_cast<StoreInst>(I))
25009    return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25010           SI->getAlign() >= Align(16) &&
25011           (SI->getOrdering() == AtomicOrdering::Release ||
25012            SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
25013
25014  if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
25015    return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25016           RMW->getAlign() >= Align(16) &&
25017           (RMW->getOperation() == AtomicRMWInst::Xchg ||
25018            RMW->getOperation() == AtomicRMWInst::And ||
25019            RMW->getOperation() == AtomicRMWInst::Or);
25020
25021  return false;
25022}
25023
25024bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
25025  if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
25026    return false;
25027
25028  if (auto LI = dyn_cast<LoadInst>(I))
25029    return LI->getType()->getPrimitiveSizeInBits() == 128 &&
25030           LI->getAlign() >= Align(16) &&
25031           LI->getOrdering() == AtomicOrdering::Acquire;
25032
25033  if (auto SI = dyn_cast<StoreInst>(I))
25034    return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25035           SI->getAlign() >= Align(16) &&
25036           SI->getOrdering() == AtomicOrdering::Release;
25037
25038  return false;
25039}
25040
25041bool AArch64TargetLowering::shouldInsertFencesForAtomic(
25042    const Instruction *I) const {
25043  if (isOpSuitableForRCPC3(I))
25044    return false;
25045  if (isOpSuitableForLSE128(I))
25046    return false;
25047  if (isOpSuitableForLDPSTP(I))
25048    return true;
25049  return false;
25050}
25051
25052bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
25053    const Instruction *I) const {
25054  // Store-Release instructions only provide seq_cst guarantees when paired with
25055  // Load-Acquire instructions. MSVC CRT does not use these instructions to
25056  // implement seq_cst loads and stores, so we need additional explicit fences
25057  // after memory writes.
25058  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
25059    return false;
25060
25061  switch (I->getOpcode()) {
25062  default:
25063    return false;
25064  case Instruction::AtomicCmpXchg:
25065    return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
25066           AtomicOrdering::SequentiallyConsistent;
25067  case Instruction::AtomicRMW:
25068    return cast<AtomicRMWInst>(I)->getOrdering() ==
25069           AtomicOrdering::SequentiallyConsistent;
25070  case Instruction::Store:
25071    return cast<StoreInst>(I)->getOrdering() ==
25072           AtomicOrdering::SequentiallyConsistent;
25073  }
25074}
25075
25076// Loads and stores smaller than 128 bits are already atomic; ones above that
25077// are doomed anyway, so defer to the default libcall and blame the OS when
25078// things go wrong.
25079TargetLoweringBase::AtomicExpansionKind
25080AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
25081  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
25082  if (Size != 128)
25083    return AtomicExpansionKind::None;
25084  if (isOpSuitableForRCPC3(SI))
25085    return AtomicExpansionKind::None;
25086  if (isOpSuitableForLSE128(SI))
25087    return AtomicExpansionKind::Expand;
25088  if (isOpSuitableForLDPSTP(SI))
25089    return AtomicExpansionKind::None;
25090  return AtomicExpansionKind::Expand;
25091}
25092
25093// Loads and stores smaller than 128 bits are already atomic; ones above that
25094// are doomed anyway, so defer to the default libcall and blame the OS when
25095// things go wrong.
25096TargetLowering::AtomicExpansionKind
25097AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
25098  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
25099
25100  if (Size != 128)
25101    return AtomicExpansionKind::None;
25102  if (isOpSuitableForRCPC3(LI))
25103    return AtomicExpansionKind::None;
25104  // No LSE128 loads
25105  if (isOpSuitableForLDPSTP(LI))
25106    return AtomicExpansionKind::None;
25107
25108  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25109  // implement atomicrmw without spilling. If the target address is also on the
25110  // stack and close enough to the spill slot, this can lead to a situation
25111  // where the monitor always gets cleared and the atomic operation can never
25112  // succeed. So at -O0 lower this operation to a CAS loop.
25113  if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25114    return AtomicExpansionKind::CmpXChg;
25115
25116  // Using CAS for an atomic load has a better chance of succeeding under high
25117  // contention situations. So use it if available.
25118  return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
25119                             : AtomicExpansionKind::LLSC;
25120}
25121
25122// The "default" for integer RMW operations is to expand to an LL/SC loop.
25123// However, with the LSE instructions (or outline-atomics mode, which provides
25124// library routines in place of the LSE instructions), we can directly emit many
25125// operations instead.
25126//
25127// Floating-point operations are always emitted to a cmpxchg loop, because they
25128// may trigger a trap which aborts an LLSC sequence.
25129TargetLowering::AtomicExpansionKind
25130AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
25131  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
25132  assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
25133
25134  if (AI->isFloatingPointOperation())
25135    return AtomicExpansionKind::CmpXChg;
25136
25137  bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
25138                      (AI->getOperation() == AtomicRMWInst::Xchg ||
25139                       AI->getOperation() == AtomicRMWInst::Or ||
25140                       AI->getOperation() == AtomicRMWInst::And);
25141  if (CanUseLSE128)
25142    return AtomicExpansionKind::None;
25143
25144  // Nand is not supported in LSE.
25145  // Leave 128 bits to LLSC or CmpXChg.
25146  if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
25147    if (Subtarget->hasLSE())
25148      return AtomicExpansionKind::None;
25149    if (Subtarget->outlineAtomics()) {
25150      // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
25151      // Don't outline them unless
25152      // (1) high level <atomic> support approved:
25153      //   http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
25154      // (2) low level libgcc and compiler-rt support implemented by:
25155      //   min/max outline atomics helpers
25156      if (AI->getOperation() != AtomicRMWInst::Min &&
25157          AI->getOperation() != AtomicRMWInst::Max &&
25158          AI->getOperation() != AtomicRMWInst::UMin &&
25159          AI->getOperation() != AtomicRMWInst::UMax) {
25160        return AtomicExpansionKind::None;
25161      }
25162    }
25163  }
25164
25165  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25166  // implement atomicrmw without spilling. If the target address is also on the
25167  // stack and close enough to the spill slot, this can lead to a situation
25168  // where the monitor always gets cleared and the atomic operation can never
25169  // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
25170  // we have a single CAS instruction that can replace the loop.
25171  if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
25172      Subtarget->hasLSE())
25173    return AtomicExpansionKind::CmpXChg;
25174
25175  return AtomicExpansionKind::LLSC;
25176}
25177
25178TargetLowering::AtomicExpansionKind
25179AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
25180    AtomicCmpXchgInst *AI) const {
25181  // If subtarget has LSE, leave cmpxchg intact for codegen.
25182  if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
25183    return AtomicExpansionKind::None;
25184  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25185  // implement cmpxchg without spilling. If the address being exchanged is also
25186  // on the stack and close enough to the spill slot, this can lead to a
25187  // situation where the monitor always gets cleared and the atomic operation
25188  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
25189  if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25190    return AtomicExpansionKind::None;
25191
25192  // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
25193  // it.
25194  unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
25195  if (Size > 64)
25196    return AtomicExpansionKind::None;
25197
25198  return AtomicExpansionKind::LLSC;
25199}
25200
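// Emit the load-exclusive half of an LL/SC loop: LDXR (or LDAXR for acquire)
// for types up to 64 bits, and LDXP/LDAXP for i128, whose {i64, i64} result
// is recombined into a single i128.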
25201Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
25202                                             Type *ValueTy, Value *Addr,
25203                                             AtomicOrdering Ord) const {
25204  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25205  bool IsAcquire = isAcquireOrStronger(Ord);
25206
25207  // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp and
25208  // ldaxp intrinsics must return {i64, i64} and we have to recombine the halves
25209  // into a single i128 here.
25210  if (ValueTy->getPrimitiveSizeInBits() == 128) {
25211    Intrinsic::ID Int =
25212        IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
25213    Function *Ldxr = Intrinsic::getDeclaration(M, Int);
25214
25215    Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
25216
25217    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
25218    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
25219    Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
25220    Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
25221    return Builder.CreateOr(
25222        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
25223  }
25224
25225  Type *Tys[] = { Addr->getType() };
25226  Intrinsic::ID Int =
25227      IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
25228  Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
25229
25230  const DataLayout &DL = M->getDataLayout();
25231  IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
25232  CallInst *CI = Builder.CreateCall(Ldxr, Addr);
25233  CI->addParamAttr(
25234      0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
25235  Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
25236
25237  return Builder.CreateBitCast(Trunc, ValueTy);
25238}
25239
25240void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
25241    IRBuilderBase &Builder) const {
25242  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25243  Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
25244}
25245
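// Emit the matching store-exclusive: STXR (or STLXR for release) for types up
// to 64 bits, and STXP/STLXP for i128 values split into two i64 halves. The
// returned value is the exclusive-store status word (0 on success).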
25246Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
25247                                                   Value *Val, Value *Addr,
25248                                                   AtomicOrdering Ord) const {
25249  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25250  bool IsRelease = isReleaseOrStronger(Ord);
25251
25252  // Since the intrinsics must have legal type, the i128 intrinsics take two
25253  // parameters: "i64, i64". We must marshal Val into the appropriate form
25254  // before the call.
25255  if (Val->getType()->getPrimitiveSizeInBits() == 128) {
25256    Intrinsic::ID Int =
25257        IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
25258    Function *Stxr = Intrinsic::getDeclaration(M, Int);
25259    Type *Int64Ty = Type::getInt64Ty(M->getContext());
25260
25261    Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
25262    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
25263    return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
25264  }
25265
25266  Intrinsic::ID Int =
25267      IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
25268  Type *Tys[] = { Addr->getType() };
25269  Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
25270
25271  const DataLayout &DL = M->getDataLayout();
25272  IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
25273  Val = Builder.CreateBitCast(Val, IntValTy);
25274
25275  CallInst *CI = Builder.CreateCall(
25276      Stxr, {Builder.CreateZExtOrBitCast(
25277                 Val, Stxr->getFunctionType()->getParamType(0)),
25278             Addr});
25279  CI->addParamAttr(1, Attribute::get(Builder.getContext(),
25280                                     Attribute::ElementType, Val->getType()));
25281  return CI;
25282}
25283
25284bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
25285    Type *Ty, CallingConv::ID CallConv, bool isVarArg,
25286    const DataLayout &DL) const {
25287  if (!Ty->isArrayTy()) {
25288    const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
25289    return TySize.isScalable() && TySize.getKnownMinValue() > 128;
25290  }
25291
25292  // All non-aggregate members of the type must have the same type.
25293  SmallVector<EVT> ValueVTs;
25294  ComputeValueVTs(*this, DL, Ty, ValueVTs);
25295  return all_equal(ValueVTs);
25296}
25297
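// Keep select(and/or(c1, c2), x, y) as-is rather than normalising it into a
// chain of selects; the combined condition lowers well using CCMP and CSEL.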
25298bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
25299                                                            EVT) const {
25300  return false;
25301}
25302
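// Form a pointer at a fixed byte offset from the thread pointer; used below to
// address well-known TLS slots such as the stack guard and SafeStack pointer.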
25303static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
25304  Module *M = IRB.GetInsertBlock()->getParent()->getParent();
25305  Function *ThreadPointerFunc =
25306      Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
25307  return IRB.CreatePointerCast(
25308      IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
25309                             Offset),
25310      IRB.getPtrTy(0));
25311}
25312
25313Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
25314  // Android provides a fixed TLS slot for the stack cookie. See the definition
25315  // of TLS_SLOT_STACK_GUARD in
25316  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
25317  if (Subtarget->isTargetAndroid())
25318    return UseTlsOffset(IRB, 0x28);
25319
25320  // Fuchsia is similar.
25321  // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
25322  if (Subtarget->isTargetFuchsia())
25323    return UseTlsOffset(IRB, -0x10);
25324
25325  return TargetLowering::getIRStackGuard(IRB);
25326}
25327
25328void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
25329  // The MSVC CRT provides functionality for stack protection.
25330  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
25331    // MSVC CRT has a global variable holding security cookie.
25332    M.getOrInsertGlobal("__security_cookie",
25333                        PointerType::getUnqual(M.getContext()));
25334
25335    // MSVC CRT has a function to validate security cookie.
25336    FunctionCallee SecurityCheckCookie =
25337        M.getOrInsertFunction(Subtarget->getSecurityCheckCookieName(),
25338                              Type::getVoidTy(M.getContext()),
25339                              PointerType::getUnqual(M.getContext()));
25340    if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
25341      F->setCallingConv(CallingConv::Win64);
25342      F->addParamAttr(0, Attribute::AttrKind::InReg);
25343    }
25344    return;
25345  }
25346  TargetLowering::insertSSPDeclarations(M);
25347}
25348
25349Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
25350  // MSVC CRT has a global variable holding security cookie.
25351  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
25352    return M.getGlobalVariable("__security_cookie");
25353  return TargetLowering::getSDagStackGuard(M);
25354}
25355
25356Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
25357  // MSVC CRT has a function to validate security cookie.
25358  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
25359    return M.getFunction(Subtarget->getSecurityCheckCookieName());
25360  return TargetLowering::getSSPStackGuardCheck(M);
25361}
25362
25363Value *
25364AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
25365  // Android provides a fixed TLS slot for the SafeStack pointer. See the
25366  // definition of TLS_SLOT_SAFESTACK in
25367  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
25368  if (Subtarget->isTargetAndroid())
25369    return UseTlsOffset(IRB, 0x48);
25370
25371  // Fuchsia is similar.
25372  // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
25373  if (Subtarget->isTargetFuchsia())
25374    return UseTlsOffset(IRB, -0x8);
25375
25376  return TargetLowering::getSafeStackPointerLocation(IRB);
25377}
25378
25379bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
25380    const Instruction &AndI) const {
25381  // Only sink 'and' mask to cmp use block if it is masking a single bit, since
25382  // this is likely to fold the and/cmp/br into a single tbz instruction. It
25383  // may be beneficial to sink in other cases, but we would have to check that
25384  // the cmp would not get folded into the br to form a cbz for these to be
25385  // beneficial.
25386  ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
25387  if (!Mask)
25388    return false;
25389  return Mask->getValue().isPowerOf2();
25390}
25391
25392bool AArch64TargetLowering::
25393    shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
25394        SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
25395        unsigned OldShiftOpcode, unsigned NewShiftOpcode,
25396        SelectionDAG &DAG) const {
25397  // Does baseline recommend not to perform the fold by default?
25398  if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
25399          X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
25400    return false;
25401  // Else, if this is a vector shift, prefer 'shl'.
25402  return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
25403}
25404
25405TargetLowering::ShiftLegalizationStrategy
25406AArch64TargetLowering::preferredShiftLegalizationStrategy(
25407    SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
25408  if (DAG.getMachineFunction().getFunction().hasMinSize() &&
25409      !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
25410    return ShiftLegalizationStrategy::LowerToLibcall;
25411  return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
25412                                                            ExpansionFactor);
25413}
25414
25415void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
25416  // Update IsSplitCSR in AArch64FunctionInfo.
25417  AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
25418  AFI->setIsSplitCSR(true);
25419}
25420
25421void AArch64TargetLowering::insertCopiesSplitCSR(
25422    MachineBasicBlock *Entry,
25423    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
25424  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
25425  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
25426  if (!IStart)
25427    return;
25428
25429  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
25430  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
25431  MachineBasicBlock::iterator MBBI = Entry->begin();
25432  for (const MCPhysReg *I = IStart; *I; ++I) {
25433    const TargetRegisterClass *RC = nullptr;
25434    if (AArch64::GPR64RegClass.contains(*I))
25435      RC = &AArch64::GPR64RegClass;
25436    else if (AArch64::FPR64RegClass.contains(*I))
25437      RC = &AArch64::FPR64RegClass;
25438    else
25439      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
25440
25441    Register NewVR = MRI->createVirtualRegister(RC);
25442    // Create copy from CSR to a virtual register.
25443    // FIXME: this currently does not emit CFI pseudo-instructions, it works
25444    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
25445    // nounwind. If we want to generalize this later, we may need to emit
25446    // CFI pseudo-instructions.
25447    assert(Entry->getParent()->getFunction().hasFnAttribute(
25448               Attribute::NoUnwind) &&
25449           "Function should be nounwind in insertCopiesSplitCSR!");
25450    Entry->addLiveIn(*I);
25451    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
25452        .addReg(*I);
25453
25454    // Insert the copy-back instructions right before the terminator.
25455    for (auto *Exit : Exits)
25456      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
25457              TII->get(TargetOpcode::COPY), *I)
25458          .addReg(NewVR);
25459  }
25460}
25461
25462bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
25463  // Integer division on AArch64 is expensive. However, when aggressively
25464  // optimizing for code size, we prefer to use a div instruction, as it is
25465  // usually smaller than the alternative sequence.
25466  // The exception to this is vector division. Since AArch64 doesn't have vector
25467  // integer division, leaving the division as-is is a loss even in terms of
25468  // size, because it will have to be scalarized, while the alternative code
25469  // sequence can be performed in vector form.
25470  bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
25471  return OptSize && !VT.isVector();
25472}
25473
25474bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
25475  // We want inc-of-add for scalars and sub-of-not for vectors.
25476  return VT.isScalarInteger();
25477}
25478
25479bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
25480                                                 EVT VT) const {
25481  // v8f16 without fp16 needs to be extended to v8f32, which is more difficult
25482  // to legalize.
25483  if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
25484    return false;
25485  return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
25486}
25487
25488MachineInstr *
25489AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
25490                                     MachineBasicBlock::instr_iterator &MBBI,
25491                                     const TargetInstrInfo *TII) const {
25492  assert(MBBI->isCall() && MBBI->getCFIType() &&
25493         "Invalid call instruction for a KCFI check");
25494
25495  switch (MBBI->getOpcode()) {
25496  case AArch64::BLR:
25497  case AArch64::BLRNoIP:
25498  case AArch64::TCRETURNri:
25499  case AArch64::TCRETURNriBTI:
25500    break;
25501  default:
25502    llvm_unreachable("Unexpected CFI call opcode");
25503  }
25504
25505  MachineOperand &Target = MBBI->getOperand(0);
25506  assert(Target.isReg() && "Invalid target operand for an indirect call");
25507  Target.setIsRenamable(false);
25508
25509  return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
25510      .addReg(Target.getReg())
25511      .addImm(MBBI->getCFIType())
25512      .getInstr();
25513}
25514
25515bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
25516  return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
25517}
25518
25519unsigned
25520AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
25521  if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
25522    return getPointerTy(DL).getSizeInBits();
25523
25524  return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
25525}
25526
25527void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
25528  MachineFrameInfo &MFI = MF.getFrameInfo();
25529  // If we have any vulnerable SVE stack objects then the stack protector
25530  // needs to be placed at the top of the SVE stack area, as the SVE locals
25531  // are placed above the other locals, so we allocate it as if it were a
25532  // scalable vector.
25533  // FIXME: It may be worthwhile having a specific interface for this rather
25534  // than doing it here in finalizeLowering.
25535  if (MFI.hasStackProtectorIndex()) {
25536    for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
25537      if (MFI.getStackID(i) == TargetStackID::ScalableVector &&
25538          MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) {
25539        MFI.setStackID(MFI.getStackProtectorIndex(),
25540                       TargetStackID::ScalableVector);
25541        MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16));
25542        break;
25543      }
25544    }
25545  }
25546  MFI.computeMaxCallFrameSize(MF);
25547  TargetLoweringBase::finalizeLowering(MF);
25548}
25549
25550// Unlike X86, we let frame lowering assign offsets to all catch objects.
25551bool AArch64TargetLowering::needsFixedCatchObjects() const {
25552  return false;
25553}
25554
25555bool AArch64TargetLowering::shouldLocalize(
25556    const MachineInstr &MI, const TargetTransformInfo *TTI) const {
25557  auto &MF = *MI.getMF();
25558  auto &MRI = MF.getRegInfo();
25559  auto maxUses = [](unsigned RematCost) {
25560    // A cost of 1 means remats are basically free.
25561    if (RematCost == 1)
25562      return std::numeric_limits<unsigned>::max();
25563    if (RematCost == 2)
25564      return 2U;
25565
25566    // Remat is too expensive, only sink if there's one user.
25567    if (RematCost > 2)
25568      return 1U;
25569    llvm_unreachable("Unexpected remat cost");
25570  };
25571
25572  unsigned Opc = MI.getOpcode();
25573  switch (Opc) {
25574  case TargetOpcode::G_GLOBAL_VALUE: {
25575    // On Darwin, TLS global vars get selected into function calls, which
25576    // we don't want localized, as they can get moved into the middle of
25577    // another call sequence.
25578    const GlobalValue &GV = *MI.getOperand(1).getGlobal();
25579    if (GV.isThreadLocal() && Subtarget->isTargetMachO())
25580      return false;
25581    return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
25582  }
25583  case TargetOpcode::G_FCONSTANT:
25584  case TargetOpcode::G_CONSTANT: {
25585    const ConstantInt *CI;
25586    unsigned AdditionalCost = 0;
25587
25588    if (Opc == TargetOpcode::G_CONSTANT)
25589      CI = MI.getOperand(1).getCImm();
25590    else {
25591      LLT Ty = MRI.getType(MI.getOperand(0).getReg());
25592      // We try to estimate cost of 32/64b fpimms, as they'll likely be
25593      // materialized as integers.
25594      if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
25595        break;
25596      auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
25597      bool OptForSize =
25598          MF.getFunction().hasOptSize() || MF.getFunction().hasMinSize();
25599      if (isFPImmLegal(APF, EVT::getFloatingPointVT(Ty.getScalarSizeInBits()),
25600                       OptForSize))
25601        return true; // Constant should be cheap.
25602      CI =
25603          ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
25604      // FP materialization also costs an extra move, from gpr to fpr.
25605      AdditionalCost = 1;
25606    }
25607    APInt Imm = CI->getValue();
25608    InstructionCost Cost = TTI->getIntImmCost(
25609        Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
25610    assert(Cost.isValid() && "Expected a valid imm cost");
25611
25612    unsigned RematCost = *Cost.getValue();
25613    RematCost += AdditionalCost;
25614    Register Reg = MI.getOperand(0).getReg();
25615    unsigned MaxUses = maxUses(RematCost);
25616    // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
25617    if (MaxUses == std::numeric_limits<unsigned>::max())
25618      --MaxUses;
25619    return MRI.hasAtMostUserInstrs(Reg, MaxUses);
25620  }
25621  // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
25622  // localizable.
25623  case AArch64::ADRP:
25624  case AArch64::G_ADD_LOW:
25625  // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
25626  case TargetOpcode::G_PTR_ADD:
25627    return true;
25628  default:
25629    break;
25630  }
25631  return TargetLoweringBase::shouldLocalize(MI, TTI);
25632}
25633
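// Fall back from GlobalISel to SelectionDAG for anything involving scalable
// types, and for calls that require an SME streaming-mode change or lazy save.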
25634bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
25635  if (Inst.getType()->isScalableTy())
25636    return true;
25637
25638  for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
25639    if (Inst.getOperand(i)->getType()->isScalableTy())
25640      return true;
25641
25642  if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
25643    if (AI->getAllocatedType()->isScalableTy())
25644      return true;
25645  }
25646
25647  // Checks to allow the use of SME instructions
25648  if (auto *Base = dyn_cast<CallBase>(&Inst)) {
25649    auto CallerAttrs = SMEAttrs(*Inst.getFunction());
25650    auto CalleeAttrs = SMEAttrs(*Base);
25651    if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
25652        CallerAttrs.requiresLazySave(CalleeAttrs))
25653      return true;
25654  }
25655  return false;
25656}
25657
25658// Return the largest legal scalable vector type that matches VT's element type.
25659static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
25660  assert(VT.isFixedLengthVector() &&
25661         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
25662         "Expected legal fixed length vector!");
25663  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
25664  default:
25665    llvm_unreachable("unexpected element type for SVE container");
25666  case MVT::i8:
25667    return EVT(MVT::nxv16i8);
25668  case MVT::i16:
25669    return EVT(MVT::nxv8i16);
25670  case MVT::i32:
25671    return EVT(MVT::nxv4i32);
25672  case MVT::i64:
25673    return EVT(MVT::nxv2i64);
25674  case MVT::f16:
25675    return EVT(MVT::nxv8f16);
25676  case MVT::f32:
25677    return EVT(MVT::nxv4f32);
25678  case MVT::f64:
25679    return EVT(MVT::nxv2f64);
25680  }
25681}
25682
25683// Return a PTRUE with active lanes corresponding to the extent of VT.
25684static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
25685                                                EVT VT) {
25686  assert(VT.isFixedLengthVector() &&
25687         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
25688         "Expected legal fixed length vector!");
25689
25690  std::optional<unsigned> PgPattern =
25691      getSVEPredPatternFromNumElements(VT.getVectorNumElements());
25692  assert(PgPattern && "Unexpected element count for SVE predicate");
25693
25694  // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
25695  // AArch64SVEPredPattern::all, which can enable the use of unpredicated
25696  // variants of instructions when available.
25697  const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
25698  unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
25699  unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
25700  if (MaxSVESize && MinSVESize == MaxSVESize &&
25701      MaxSVESize == VT.getSizeInBits())
25702    PgPattern = AArch64SVEPredPattern::all;
25703
25704  MVT MaskVT;
25705  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
25706  default:
25707    llvm_unreachable("unexpected element type for SVE predicate");
25708  case MVT::i8:
25709    MaskVT = MVT::nxv16i1;
25710    break;
25711  case MVT::i16:
25712  case MVT::f16:
25713    MaskVT = MVT::nxv8i1;
25714    break;
25715  case MVT::i32:
25716  case MVT::f32:
25717    MaskVT = MVT::nxv4i1;
25718    break;
25719  case MVT::i64:
25720  case MVT::f64:
25721    MaskVT = MVT::nxv2i1;
25722    break;
25723  }
25724
25725  return getPTrue(DAG, DL, MaskVT, *PgPattern);
25726}
25727
25728static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
25729                                             EVT VT) {
25730  assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
25731         "Expected legal scalable vector!");
25732  auto PredTy = VT.changeVectorElementType(MVT::i1);
25733  return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
25734}
25735
25736static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
25737  if (VT.isFixedLengthVector())
25738    return getPredicateForFixedLengthVector(DAG, DL, VT);
25739
25740  return getPredicateForScalableVector(DAG, DL, VT);
25741}
25742
25743// Grow V to consume an entire SVE register.
25744static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
25745  assert(VT.isScalableVector() &&
25746         "Expected to convert into a scalable vector!");
25747  assert(V.getValueType().isFixedLengthVector() &&
25748         "Expected a fixed length vector operand!");
25749  SDLoc DL(V);
25750  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
25751  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
25752}
25753
25754// Shrink V so it's just big enough to maintain a VT's worth of data.
25755static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
25756  assert(VT.isFixedLengthVector() &&
25757         "Expected to convert into a fixed length vector!");
25758  assert(V.getValueType().isScalableVector() &&
25759         "Expected a scalable vector operand!");
25760  SDLoc DL(V);
25761  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
25762  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
25763}
25764
25765// Convert all fixed length vector loads larger than NEON to masked_loads.
25766SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
25767    SDValue Op, SelectionDAG &DAG) const {
25768  auto Load = cast<LoadSDNode>(Op);
25769
25770  SDLoc DL(Op);
25771  EVT VT = Op.getValueType();
25772  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
25773  EVT LoadVT = ContainerVT;
25774  EVT MemVT = Load->getMemoryVT();
25775
25776  auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
25777
25778  if (VT.isFloatingPoint()) {
25779    LoadVT = ContainerVT.changeTypeToInteger();
25780    MemVT = MemVT.changeTypeToInteger();
25781  }
25782
25783  SDValue NewLoad = DAG.getMaskedLoad(
25784      LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
25785      DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
25786      Load->getAddressingMode(), Load->getExtensionType());
25787
25788  SDValue Result = NewLoad;
25789  if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
25790    EVT ExtendVT = ContainerVT.changeVectorElementType(
25791        Load->getMemoryVT().getVectorElementType());
25792
25793    Result = getSVESafeBitCast(ExtendVT, Result, DAG);
25794    Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
25795                         Pg, Result, DAG.getUNDEF(ContainerVT));
25796  } else if (VT.isFloatingPoint()) {
25797    Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
25798  }
25799
25800  Result = convertFromScalableVector(DAG, VT, Result);
25801  SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
25802  return DAG.getMergeValues(MergedValues, DL);
25803}
25804
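// Convert a fixed length vector mask into an SVE predicate: an all-ones mask
// maps directly onto the PTRUE for the container type, otherwise widen the
// mask into the container and compare it against zero.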
25805static SDValue convertFixedMaskToScalableVector(SDValue Mask,
25806                                                SelectionDAG &DAG) {
25807  SDLoc DL(Mask);
25808  EVT InVT = Mask.getValueType();
25809  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
25810
25811  auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
25812
25813  if (ISD::isBuildVectorAllOnes(Mask.getNode()))
25814    return Pg;
25815
25816  auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
25817  auto Op2 = DAG.getConstant(0, DL, ContainerVT);
25818
25819  return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
25820                     {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
25821}
25822
25823// Convert fixed length vector masked loads wider than NEON to SVE masked loads.
25824SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
25825    SDValue Op, SelectionDAG &DAG) const {
25826  auto Load = cast<MaskedLoadSDNode>(Op);
25827
25828  SDLoc DL(Op);
25829  EVT VT = Op.getValueType();
25830  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
25831
25832  SDValue Mask = Load->getMask();
25833  // If this is an extending load and the mask type is not the same as
25834  // the load's type, then we have to extend the mask type.
25835  if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
25836    assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
25837           "Incorrect mask type");
25838    Mask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Mask);
25839  }
25840  Mask = convertFixedMaskToScalableVector(Mask, DAG);
25841
25842  SDValue PassThru;
25843  bool IsPassThruZeroOrUndef = false;
25844
25845  if (Load->getPassThru()->isUndef()) {
25846    PassThru = DAG.getUNDEF(ContainerVT);
25847    IsPassThruZeroOrUndef = true;
25848  } else {
25849    if (ContainerVT.isInteger())
25850      PassThru = DAG.getConstant(0, DL, ContainerVT);
25851    else
25852      PassThru = DAG.getConstantFP(0, DL, ContainerVT);
25853    if (isZerosVector(Load->getPassThru().getNode()))
25854      IsPassThruZeroOrUndef = true;
25855  }
25856
25857  SDValue NewLoad = DAG.getMaskedLoad(
25858      ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
25859      Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
25860      Load->getAddressingMode(), Load->getExtensionType());
25861
25862  SDValue Result = NewLoad;
25863  if (!IsPassThruZeroOrUndef) {
25864    SDValue OldPassThru =
25865        convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
25866    Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
25867  }
25868
25869  Result = convertFromScalableVector(DAG, VT, Result);
25870  SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
25871  return DAG.getMergeValues(MergedValues, DL);
25872}

// Convert all fixed length vector stores larger than NEON to masked_stores.
SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  auto Store = cast<StoreSDNode>(Op);

  SDLoc DL(Op);
  EVT VT = Store->getValue().getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  EVT MemVT = Store->getMemoryVT();

  auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
  auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());

  if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
    EVT TruncVT = ContainerVT.changeVectorElementType(
        Store->getMemoryVT().getVectorElementType());
    MemVT = MemVT.changeTypeToInteger();
    NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
                           NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
                           DAG.getUNDEF(TruncVT));
    NewValue =
        getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
  } else if (VT.isFloatingPoint()) {
    MemVT = MemVT.changeTypeToInteger();
    NewValue =
        getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
  }

  return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
                            Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
                            Store->getMemOperand(), Store->getAddressingMode(),
                            Store->isTruncatingStore());
}

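// Convert fixed length vector masked stores larger than NEON to SVE masked
// stores operating on the equivalent scalable container type.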
SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  auto *Store = cast<MaskedStoreSDNode>(Op);

  SDLoc DL(Op);
  EVT VT = Store->getValue().getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

  auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
  SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);

  return DAG.getMaskedStore(
      Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
      Mask, Store->getMemoryVT(), Store->getMemOperand(),
      Store->getAddressingMode(), Store->isTruncatingStore());
}

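// Lower fixed length vector SDIV/UDIV to SVE. A signed divide by a
// power-of-two splat becomes a predicated arithmetic shift, i32/i64 element
// types map directly onto predicated DIV nodes, and i8/i16 element types are
// widened first.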
SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT EltVT = VT.getVectorElementType();

  bool Signed = Op.getOpcode() == ISD::SDIV;
  unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;

  bool Negated;
  uint64_t SplatVal;
  if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
    SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
    SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);

    SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
    SDValue Res =
        DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
    if (Negated)
      Res = DAG.getNode(ISD::SUB, dl, ContainerVT,
                        DAG.getConstant(0, dl, ContainerVT), Res);

    return convertFromScalableVector(DAG, VT, Res);
  }

  // Scalable vector i32/i64 DIV is supported.
  if (EltVT == MVT::i32 || EltVT == MVT::i64)
    return LowerToPredicatedOp(Op, DAG, PredOpcode);

  // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
  EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
  EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
  unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

  // If the wider type is legal: extend, op, and truncate.
  EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
  if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
    SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
    SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
    SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
  }

  auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
                               &ExtendOpcode](SDValue Op) {
    SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
    SDValue IdxHalf =
        DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
    SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
    return std::pair<SDValue, SDValue>(
        {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
         DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
  };

  // If the wider type is not legal: split, extend, op, trunc and concat.
  auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
  auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
  SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
  SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
  SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
  SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
}

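// Lower fixed length vector SIGN_EXTEND/ZERO_EXTEND by repeatedly unpacking
// the low half with [SU]UNPKLO until the desired element type is reached.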
SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
  Val = convertToScalableVector(DAG, ContainerVT, Val);

  bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
  unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;

  // Repeatedly unpack Val until the result is of the desired element type.
  switch (ContainerVT.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unimplemented container type");
  case MVT::nxv16i8:
    Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
    if (VT.getVectorElementType() == MVT::i16)
      break;
    [[fallthrough]];
  case MVT::nxv8i16:
    Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
    if (VT.getVectorElementType() == MVT::i32)
      break;
    [[fallthrough]];
  case MVT::nxv4i32:
    Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
    assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
    break;
  }

  return convertFromScalableVector(DAG, VT, Val);
}

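// Lower fixed length vector TRUNCATE by repeatedly using UZP1 to select the
// narrower even-indexed elements until the desired element type is reached.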
SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
  Val = convertToScalableVector(DAG, ContainerVT, Val);

  // Repeatedly truncate Val until the result is of the desired element type.
  switch (ContainerVT.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unimplemented container type");
  case MVT::nxv2i64:
    Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
    Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
    if (VT.getVectorElementType() == MVT::i32)
      break;
    [[fallthrough]];
  case MVT::nxv4i32:
    Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
    Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
    if (VT.getVectorElementType() == MVT::i16)
      break;
    [[fallthrough]];
  case MVT::nxv8i16:
    Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
    Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
    assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
    break;
  }

  return convertFromScalableVector(DAG, VT, Val);
}

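// Lower EXTRACT_VECTOR_ELT from a fixed length vector by extracting from the
// equivalent scalable container type.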
SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
    SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  EVT InVT = Op.getOperand(0).getValueType();
  assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
  SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
}

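// Lower INSERT_VECTOR_ELT into a fixed length vector by inserting into the
// equivalent scalable container type.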
SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
    SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  EVT InVT = Op.getOperand(0).getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
  SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));

  auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
                                 Op.getOperand(1), Op.getOperand(2));

  return convertFromScalableVector(DAG, VT, ScalableRes);
}

// Convert vector operation 'Op' to an equivalent predicated operation whereby
// the original operation's type is used to construct a suitable predicate.
// NOTE: The results for inactive lanes are undefined.
SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
                                                   SelectionDAG &DAG,
                                                   unsigned NewOp) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  auto Pg = getPredicateForVector(DAG, DL, VT);

  if (VT.isFixedLengthVector()) {
    assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

    // Create list of operands by converting existing ones to scalable types.
    SmallVector<SDValue, 4> Operands = {Pg};
    for (const SDValue &V : Op->op_values()) {
      if (isa<CondCodeSDNode>(V)) {
        Operands.push_back(V);
        continue;
      }

      if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
        EVT VTArg = VTNode->getVT().getVectorElementType();
        EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
        Operands.push_back(DAG.getValueType(NewVTArg));
        continue;
      }

      assert(isTypeLegal(V.getValueType()) &&
             "Expected only legal fixed-width types");
      Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
    }

    if (isMergePassthruOpcode(NewOp))
      Operands.push_back(DAG.getUNDEF(ContainerVT));

    auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
    return convertFromScalableVector(DAG, VT, ScalableRes);
  }

  assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");

  SmallVector<SDValue, 4> Operands = {Pg};
  for (const SDValue &V : Op->op_values()) {
    assert((!V.getValueType().isVector() ||
            V.getValueType().isScalableVector()) &&
           "Only scalable vectors are supported!");
    Operands.push_back(V);
  }

  if (isMergePassthruOpcode(NewOp))
    Operands.push_back(DAG.getUNDEF(VT));

  return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
}

// If a fixed length vector operation has no side effects when applied to
// undefined elements, we can safely use scalable vectors to perform the same
// operation without needing to worry about predication.
SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
                                                 SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && isTypeLegal(VT) &&
         "Only expected to lower fixed length vector operation!");
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

  // Create list of operands by converting existing ones to scalable types.
  SmallVector<SDValue, 4> Ops;
  for (const SDValue &V : Op->op_values()) {
    assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");

    // Pass through non-vector operands.
    if (!V.getValueType().isVector()) {
      Ops.push_back(V);
      continue;
    }

    // "cast" fixed length vector to a scalable vector.
    assert(V.getValueType().isFixedLengthVector() &&
           isTypeLegal(V.getValueType()) &&
           "Only fixed length vectors are supported!");
    Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
  }

  auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
  return convertFromScalableVector(DAG, VT, ScalableRes);
}

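// Lower VECREDUCE_SEQ_FADD by inserting the accumulator into lane 0 of an
// undef vector, performing a predicated FADDA reduction and extracting the
// result from lane 0.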
SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
    SelectionDAG &DAG) const {
  SDLoc DL(ScalarOp);
  SDValue AccOp = ScalarOp.getOperand(0);
  SDValue VecOp = ScalarOp.getOperand(1);
  EVT SrcVT = VecOp.getValueType();
  EVT ResVT = SrcVT.getVectorElementType();

  EVT ContainerVT = SrcVT;
  if (SrcVT.isFixedLengthVector()) {
    ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
    VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
  }

  SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);

  // Convert operands to Scalable.
  AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
                      DAG.getUNDEF(ContainerVT), AccOp, Zero);

  // Perform reduction.
  SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
                            Pg, AccOp, VecOp);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
}

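// Lower i1 vector reductions over scalable predicates: VECREDUCE_OR and
// VECREDUCE_AND are implemented with PTEST, VECREDUCE_XOR via CNTP.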
SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
                                                       SelectionDAG &DAG) const {
  SDLoc DL(ReduceOp);
  SDValue Op = ReduceOp.getOperand(0);
  EVT OpVT = Op.getValueType();
  EVT VT = ReduceOp.getValueType();

  if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
    return SDValue();

  SDValue Pg = getPredicateForVector(DAG, DL, OpVT);

  switch (ReduceOp.getOpcode()) {
  default:
    return SDValue();
  case ISD::VECREDUCE_OR:
    if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
      // The predicate can be 'Op' because
      // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
      return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
    else
      return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
  case ISD::VECREDUCE_AND: {
    Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
    return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
  }
  case ISD::VECREDUCE_XOR: {
    SDValue ID =
        DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
    if (OpVT == MVT::nxv1i1) {
      // Emulate a CNTP on .Q using .D and a different governing predicate.
      Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
      Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
    }
    SDValue Cntp =
        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
    return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
  }
  }

  return SDValue();
}

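// Lower a vector reduction to the equivalent predicated SVE reduction node and
// extract the scalar result from lane 0.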
SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
                                                   SDValue ScalarOp,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(ScalarOp);
  SDValue VecOp = ScalarOp.getOperand(0);
  EVT SrcVT = VecOp.getValueType();

  if (useSVEForFixedLengthVectorVT(
          SrcVT,
          /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
    EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
    VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
  }

  // UADDV always returns an i64 result.
  EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
                                                   SrcVT.getVectorElementType();
  EVT RdxVT = SrcVT;
  if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
    RdxVT = getPackedSVEVectorVT(ResVT);

  SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
  SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
  SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
                            Rdx, DAG.getConstant(0, DL, MVT::i64));

  // The VEC_REDUCE nodes expect an element size result.
  if (ResVT != ScalarOp.getValueType())
    Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());

  return Res;
}

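// Lower VSELECT on fixed length vectors by converting the operands to scalable
// container types and truncating the mask to an i1 predicate.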
SDValue
AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
    SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  EVT InVT = Op.getOperand(1).getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
  SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
  SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));

  // Convert the mask to a predicate (NOTE: We don't need to worry about
  // inactive lanes since VSELECT is safe when given undefined elements).
  EVT MaskVT = Op.getOperand(0).getValueType();
  EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
  auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
  Mask = DAG.getNode(ISD::TRUNCATE, DL,
                     MaskContainerVT.changeVectorElementType(MVT::i1), Mask);

  auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
                                Mask, Op1, Op2);

  return convertFromScalableVector(DAG, VT, ScalableRes);
}

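// Lower SETCC on fixed length vectors via SETCC_MERGE_ZERO and extend the
// predicate result back to an integer vector of the original width.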
SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT InVT = Op.getOperand(0).getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);

  assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
         "Only expected to lower fixed length vector operation!");
  assert(Op.getValueType() == InVT.changeTypeToInteger() &&
         "Expected integer result of the same bit length as the inputs!");

  auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
  auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
  auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);

  EVT CmpVT = Pg.getValueType();
  auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
                         {Pg, Op1, Op2, Op.getOperand(2)});

  EVT PromoteVT = ContainerVT.changeTypeToInteger();
  auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
  return convertFromScalableVector(DAG, Op.getValueType(), Promote);
}

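// Lower a bitcast between fixed length vector types by bitcasting between
// their scalable container types.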
SDValue
AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  SDLoc DL(Op);
  auto SrcOp = Op.getOperand(0);
  EVT VT = Op.getValueType();
  EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
  EVT ContainerSrcVT =
      getContainerForFixedLengthVector(DAG, SrcOp.getValueType());

  SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
  Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
  return convertFromScalableVector(DAG, VT, Op);
}

SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  unsigned NumOperands = Op->getNumOperands();

  assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
         "Unexpected number of operands in CONCAT_VECTORS");

  auto SrcOp1 = Op.getOperand(0);
  auto SrcOp2 = Op.getOperand(1);
  EVT VT = Op.getValueType();
  EVT SrcVT = SrcOp1.getValueType();

  if (NumOperands > 2) {
    SmallVector<SDValue, 4> Ops;
    EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
    for (unsigned I = 0; I < NumOperands; I += 2)
      Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
                                Op->getOperand(I), Op->getOperand(I + 1)));

    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
  }

  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

  SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
  SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
  SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);

  Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);

  return convertFromScalableVector(DAG, VT, Op);
}

SDValue
AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
                                                     SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  SDValue Pg = getPredicateForVector(DAG, DL, VT);
  EVT SrcVT = Val.getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  EVT ExtendVT = ContainerVT.changeVectorElementType(
      SrcVT.getVectorElementType());

  Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
  Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);

  Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
  Val = getSVESafeBitCast(ExtendVT, Val, DAG);
  Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
                    Pg, Val, DAG.getUNDEF(ContainerVT));

  return convertFromScalableVector(DAG, VT, Val);
}

SDValue
AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT SrcVT = Val.getValueType();
  EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
  EVT RoundVT = ContainerSrcVT.changeVectorElementType(
      VT.getVectorElementType());
  SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);

  Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
  Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
                    Op.getOperand(1), DAG.getUNDEF(RoundVT));
  Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
  Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);

  Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
  return DAG.getNode(ISD::BITCAST, DL, VT, Val);
}

SDValue
AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
  unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
                             : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT SrcVT = Val.getValueType();
  EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
  EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);

  if (VT.bitsGE(SrcVT)) {
    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);

    Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
                      VT.changeTypeToInteger(), Val);

    // Safe to use a larger than specified operand because by promoting the
    // value nothing has changed from an arithmetic point of view.
    Val =
        convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
    Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
                      DAG.getUNDEF(ContainerDstVT));
    return convertFromScalableVector(DAG, VT, Val);
  } else {
    EVT CvtVT = ContainerSrcVT.changeVectorElementType(
        ContainerDstVT.getVectorElementType());
    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);

    Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
    Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
    Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
    Val = convertFromScalableVector(DAG, SrcVT, Val);

    Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
    return DAG.getNode(ISD::BITCAST, DL, VT, Val);
  }
}

SDValue
AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT OpVT = Op.getValueType();
  assert(OpVT.isScalableVector() &&
         "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
  SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
                             Op.getOperand(1));
  SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
                            Op.getOperand(1));
  return DAG.getMergeValues({Even, Odd}, DL);
}

SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
                                                      SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT OpVT = Op.getValueType();
  assert(OpVT.isScalableVector() &&
         "Expected scalable vector in LowerVECTOR_INTERLEAVE.");

  SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
                           Op.getOperand(1));
  SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
                           Op.getOperand(1));
  return DAG.getMergeValues({Lo, Hi}, DL);
}

SDValue
AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
  unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
                             : AArch64ISD::FCVTZU_MERGE_PASSTHRU;

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT SrcVT = Val.getValueType();
  EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
  EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);

  if (VT.bitsGT(SrcVT)) {
    EVT CvtVT = ContainerDstVT.changeVectorElementType(
      ContainerSrcVT.getVectorElementType());
    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);

    Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
    Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);

    Val = convertToScalableVector(DAG, ContainerDstVT, Val);
    Val = getSVESafeBitCast(CvtVT, Val, DAG);
    Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
                      DAG.getUNDEF(ContainerDstVT));
    return convertFromScalableVector(DAG, VT, Val);
  } else {
    EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);

    // Safe to use a larger than specified result since an fp_to_int where the
    // result doesn't fit into the destination is undefined.
    Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
    Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
    Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);

    return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
  }
}

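// Lower a fixed length VECTOR_SHUFFLE to an SVE TBL (or SVE2 TBL2 for
// two-operand shuffles) using an explicitly constructed index vector.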
static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
                                         ArrayRef<int> ShuffleMask, EVT VT,
                                         EVT ContainerVT, SelectionDAG &DAG) {
  auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
  SDLoc DL(Op);
  unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
  unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
  bool IsSingleOp =
      ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());

  if (!Subtarget.isNeonAvailable() && !MinSVESize)
    MinSVESize = 128;

  // Ignore two-operand shuffles if there is no SVE2 or if not all index
  // values can be represented.
  if (!IsSingleOp && (!Subtarget.hasSVE2() || MinSVESize != MaxSVESize))
    return SDValue();

  EVT VTOp1 = Op.getOperand(0).getValueType();
  unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
  unsigned IndexLen = MinSVESize / BitsPerElt;
  unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
  uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
  assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
         "Incorrectly legalised shuffle operation");

  SmallVector<SDValue, 8> TBLMask;
  for (int Index : ShuffleMask) {
    // Handle poison index values.
    if (Index < 0)
      Index = 0;
    // If we refer to the second operand then we have to add the number of
    // elements in the hardware register minus the number of elements in the
    // type.
    if ((unsigned)Index >= ElementsPerVectorReg)
      Index += IndexLen - ElementsPerVectorReg;
    // For 8-bit elements and 1024-bit SVE registers, where MaxOffset equals
    // 255, this might point to the last element in the second operand of the
    // shufflevector, thus we reject this transform.
    if ((unsigned)Index >= MaxOffset)
      return SDValue();
    TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
  }

  // Choosing an out-of-range index leads to the lane being zeroed vs the zero
  // value where it would perform first lane duplication for out-of-range
  // elements. For i8 elements an out-of-range index could be valid for a
  // 2048-bit vector register size.
  for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i)
    TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));

  EVT MaskEltType = EVT::getIntegerVT(*DAG.getContext(), BitsPerElt);
  EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
  EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
  SDValue VecMask =
      DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
  SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);

  SDValue Shuffle;
  if (IsSingleOp)
    Shuffle =
        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
                    DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
                    Op1, SVEMask);
  else if (Subtarget.hasSVE2())
    Shuffle =
        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
                    DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
                    Op1, Op2, SVEMask);
  else
    llvm_unreachable("Cannot lower shuffle without SVE2 TBL");
  Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
}

SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
  auto ShuffleMask = SVN->getMask();

  SDLoc DL(Op);
  SDValue Op1 = Op.getOperand(0);
  SDValue Op2 = Op.getOperand(1);

  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
  Op2 = convertToScalableVector(DAG, ContainerVT, Op2);

  auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
    if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
      return MVT::i32;
    return ScalarTy;
  };

  if (SVN->isSplat()) {
    unsigned Lane = std::max(0, SVN->getSplatIndex());
    EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
    SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
                                  DAG.getConstant(Lane, DL, MVT::i64));
    Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
    return convertFromScalableVector(DAG, VT, Op);
  }

  bool ReverseEXT = false;
  unsigned Imm;
  if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
      Imm == VT.getVectorNumElements() - 1) {
    if (ReverseEXT)
      std::swap(Op1, Op2);
    EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
    SDValue Scalar = DAG.getNode(
        ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
        DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
    Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
    return convertFromScalableVector(DAG, VT, Op);
  }

  for (unsigned LaneSize : {64U, 32U, 16U}) {
    if (isREVMask(ShuffleMask, VT, LaneSize)) {
      EVT NewVT =
          getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize));
      unsigned RevOp;
      unsigned EltSz = VT.getScalarSizeInBits();
      if (EltSz == 8)
        RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
      else if (EltSz == 16)
        RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
      else
        RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;

      Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
      Op = LowerToPredicatedOp(Op, DAG, RevOp);
      Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
      return convertFromScalableVector(DAG, VT, Op);
    }
  }

  if (Subtarget->hasSVE2p1() && VT.getScalarSizeInBits() == 64 &&
      isREVMask(ShuffleMask, VT, 128)) {
    if (!VT.isFloatingPoint())
      return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);

    EVT NewVT = getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), 64));
    Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
    Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
    Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
    return convertFromScalableVector(DAG, VT, Op);
  }

  unsigned WhichResult;
  if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
    return convertFromScalableVector(
        DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));

  if (isTRNMask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
    return convertFromScalableVector(
        DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
  }

  if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
    return convertFromScalableVector(
        DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));

  if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
    return convertFromScalableVector(
        DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
  }

  // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
  // represents the same logical operation as performed by a ZIP instruction. In
  // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
  // equivalent to an AArch64 instruction. There's the extra component of
  // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
  // only operated on 64/128-bit vector types that have a direct mapping to a
  // target register and so an exact mapping is implied.
  // However, when using SVE for fixed length vectors, most legal vector types
  // are actually sub-vectors of a larger SVE register. When mapping
  // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
  // how the mask's indices translate. Specifically, when the mapping requires
  // an exact meaning for a specific vector index (e.g. Index X is the last
  // vector element in the register) then such mappings are often only safe when
  // the exact SVE register size is known. The main exception to this is when
  // indices are logically relative to the first element of either
  // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
  // when converting from fixed-length to scalable vector types (i.e. the start
  // of a fixed length vector is always the start of a scalable vector).
  unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
  unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
  if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
    if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
        Op2.isUndef()) {
      Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
      return convertFromScalableVector(DAG, VT, Op);
    }

    if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
      return convertFromScalableVector(
          DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));

    if (isUZPMask(ShuffleMask, VT, WhichResult)) {
      unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
      return convertFromScalableVector(
          DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
    }

    if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
      return convertFromScalableVector(
          DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));

    if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
      unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
      return convertFromScalableVector(
          DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
    }
  }

  // Avoid producing a TBL instruction if we don't know the minimal SVE register
  // size, unless NEON is not available and we can assume the minimal SVE
  // register size is 128 bits.
  if (MinSVESize || !Subtarget->isNeonAvailable())
    return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
                                     DAG);

  return SDValue();
}

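// Bitcast between two legal scalable vector types, going via the packed form
// of each type when either the input or the result is unpacked.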
SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT InVT = Op.getValueType();

  assert(VT.isScalableVector() && isTypeLegal(VT) &&
         InVT.isScalableVector() && isTypeLegal(InVT) &&
         "Only expect to cast between legal scalable vector types!");
  assert(VT.getVectorElementType() != MVT::i1 &&
         InVT.getVectorElementType() != MVT::i1 &&
         "For predicate bitcasts, use getSVEPredicateBitCast");

  if (InVT == VT)
    return Op;

  EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
  EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());

  // Safe bitcasting between unpacked vector types of different element counts
  // is currently unsupported because the following is missing the necessary
  // work to ensure the result's elements live where they're supposed to within
  // an SVE register.
  //                01234567
  // e.g. nxv2i32 = XX??XX??
  //      nxv4f16 = X?X?X?X?
  assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
          VT == PackedVT || InVT == PackedInVT) &&
         "Unexpected bitcast!");

  // Pack input if required.
  if (InVT != PackedInVT)
    Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);

  Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);

  // Unpack result if required.
  if (VT != PackedVT)
    Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);

  return Op;
}

bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
                                                 SDValue N) const {
  return ::isAllActivePredicate(DAG, N);
}

EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
  return ::getPromotedVTForPredicate(VT);
}

bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
    SDValue Op, const APInt &OriginalDemandedBits,
    const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
    unsigned Depth) const {

  unsigned Opc = Op.getOpcode();
  switch (Opc) {
  case AArch64ISD::VSHL: {
    // Match (VSHL (VLSHR Val X) X)
    SDValue ShiftL = Op;
    SDValue ShiftR = Op->getOperand(0);
    if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
      return false;

    if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
      return false;

    unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
    unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);

    // Other cases can be handled as well, but this is not
    // implemented.
    if (ShiftRBits != ShiftLBits)
      return false;

    unsigned ScalarSize = Op.getScalarValueSizeInBits();
    assert(ScalarSize > ShiftLBits && "Invalid shift imm");

    APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
    APInt UnusedBits = ~OriginalDemandedBits;

    if ((ZeroBits & UnusedBits) != ZeroBits)
      return false;

    // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
    // used - simplify to just Val.
    return TLO.CombineTo(Op, ShiftR->getOperand(0));
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    if (auto ElementSize = IsSVECntIntrinsic(Op)) {
      unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
      if (!MaxSVEVectorSizeInBits)
        MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
      unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
      // The SVE count intrinsics don't support the multiplier immediate so we
      // don't have to account for that here. The value returned may be slightly
      // over the true required bits, as this is based on the "ALL" pattern. The
      // other patterns are also exposed by these intrinsics, but they all
      // return a value that's strictly less than "ALL".
      unsigned RequiredBits = llvm::bit_width(MaxElements);
      unsigned BitWidth = Known.Zero.getBitWidth();
      if (RequiredBits < BitWidth)
        Known.Zero.setHighBits(BitWidth - RequiredBits);
      return false;
    }
  }
  }

  return TargetLowering::SimplifyDemandedBitsForTargetNode(
      Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
}

bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
  return Op.getOpcode() == AArch64ISD::DUP ||
         Op.getOpcode() == AArch64ISD::MOVI ||
         (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
          Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
         TargetLowering::isTargetCanonicalConstantNode(Op);
}

bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
  return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
         Subtarget->hasComplxNum();
}

bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
    ComplexDeinterleavingOperation Operation, Type *Ty) const {
  auto *VTy = dyn_cast<VectorType>(Ty);
  if (!VTy)
    return false;

  // If the vector is scalable, SVE is enabled, implying support for complex
  // numbers. Otherwise, we need to ensure complex number support is available.
  if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
    return false;

  auto *ScalarTy = VTy->getScalarType();
  unsigned NumElements = VTy->getElementCount().getKnownMinValue();

  // We can only process vectors that have a bit size of 128 or higher (with an
  // additional 64 bits for Neon). Additionally, these vectors must have a
  // power-of-2 size, as we later split them into the smallest supported size
  // and merge them back together after applying the complex operation.
  unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
  if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
      !llvm::isPowerOf2_32(VTyWidth))
    return false;

  if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
    unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
    return 8 <= ScalarWidth && ScalarWidth <= 64;
  }

  return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
         ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
}

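// Emit IR for a single complex deinterleaving operation. Vectors wider than
// 128 bits are split in half and handled recursively; otherwise the operation
// is mapped onto the SVE CMLA/FCMLA/CADD/FCADD or NEON VCMLA/VCADD intrinsics.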
Value *AArch64TargetLowering::createComplexDeinterleavingIR(
    IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
    ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
    Value *Accumulator) const {
  VectorType *Ty = cast<VectorType>(InputA->getType());
  bool IsScalable = Ty->isScalableTy();
  bool IsInt = Ty->getElementType()->isIntegerTy();

  unsigned TyWidth =
      Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();

  assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
         "Vector type must be either 64 or a power of 2 that is at least 128");

  if (TyWidth > 128) {
    int Stride = Ty->getElementCount().getKnownMinValue() / 2;
    auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
    auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
    auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
    auto *UpperSplitA =
        B.CreateExtractVector(HalfTy, InputA, B.getInt64(Stride));
    auto *UpperSplitB =
        B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
    Value *LowerSplitAcc = nullptr;
    Value *UpperSplitAcc = nullptr;
    if (Accumulator) {
      LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0));
      UpperSplitAcc =
          B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride));
    }
    auto *LowerSplitInt = createComplexDeinterleavingIR(
        B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
    auto *UpperSplitInt = createComplexDeinterleavingIR(
        B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);

    auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt,
                                        B.getInt64(0));
    return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride));
  }

  if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
    if (Accumulator == nullptr)
      Accumulator = Constant::getNullValue(Ty);

    if (IsScalable) {
      if (IsInt)
        return B.CreateIntrinsic(
            Intrinsic::aarch64_sve_cmla_x, Ty,
            {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});

      auto *Mask = B.getAllOnesMask(Ty->getElementCount());
      return B.CreateIntrinsic(
          Intrinsic::aarch64_sve_fcmla, Ty,
          {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
    }

    Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
                              Intrinsic::aarch64_neon_vcmla_rot90,
                              Intrinsic::aarch64_neon_vcmla_rot180,
                              Intrinsic::aarch64_neon_vcmla_rot270};

    return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
                             {Accumulator, InputA, InputB});
  }

  if (OperationType == ComplexDeinterleavingOperation::CAdd) {
    if (IsScalable) {
      if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
          Rotation == ComplexDeinterleavingRotation::Rotation_270) {
        if (IsInt)
          return B.CreateIntrinsic(
              Intrinsic::aarch64_sve_cadd_x, Ty,
              {InputA, InputB, B.getInt32((int)Rotation * 90)});

        auto *Mask = B.getAllOnesMask(Ty->getElementCount());
        return B.CreateIntrinsic(
            Intrinsic::aarch64_sve_fcadd, Ty,
            {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
      }
      return nullptr;
    }

    Intrinsic::ID IntId = Intrinsic::not_intrinsic;
    if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
      IntId = Intrinsic::aarch64_neon_vcadd_rot90;
    else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
      IntId = Intrinsic::aarch64_neon_vcadd_rot270;

    if (IntId == Intrinsic::not_intrinsic)
      return nullptr;

    return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
  }

  return nullptr;
}

bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
  unsigned Opc = N->getOpcode();
  if (ISD::isExtOpcode(Opc)) {
    if (any_of(N->uses(),
               [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
      return false;
  }
  return true;
}

unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
  return Subtarget->getMinimumJumpTableEntries();
}

MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                         CallingConv::ID CC,
                                                         EVT VT) const {
  bool NonUnitFixedLengthVector =
      VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
  if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);

  EVT VT1;
  MVT RegisterVT;
  unsigned NumIntermediates;
  getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
                                       RegisterVT);
  return RegisterVT;
}

unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
  bool NonUnitFixedLengthVector =
      VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
  if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
    return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);

  EVT VT1;
  MVT VT2;
  unsigned NumIntermediates;
  return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
                                              NumIntermediates, VT2);
}

unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
  if (!RegisterVT.isFixedLengthVector() ||
      RegisterVT.getFixedSizeInBits() <= 128)
    return NumRegs;

  assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
  assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
  assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");

  // A size mismatch here implies either type promotion or widening and would
  // have resulted in scalarisation if larger vectors had not been available.
  if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
    EVT EltTy = VT.getVectorElementType();
    EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
    if (!isTypeLegal(NewVT))
      NewVT = EltTy;

    IntermediateVT = NewVT;
    NumIntermediates = VT.getVectorNumElements();
    RegisterVT = getRegisterType(Context, NewVT);
    return NumIntermediates;
  }

  // SVE VLS support does not introduce a new ABI so we should use NEON sized
  // types for vector arguments and returns.

  unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
  NumIntermediates *= NumSubRegs;
  NumRegs *= NumSubRegs;

  switch (RegisterVT.getVectorElementType().SimpleTy) {
  default:
    llvm_unreachable("unexpected element type for vector");
  case MVT::i8:
    IntermediateVT = RegisterVT = MVT::v16i8;
    break;
  case MVT::i16:
    IntermediateVT = RegisterVT = MVT::v8i16;
    break;
  case MVT::i32:
    IntermediateVT = RegisterVT = MVT::v4i32;
    break;
  case MVT::i64:
    IntermediateVT = RegisterVT = MVT::v2i64;
    break;
  case MVT::f16:
    IntermediateVT = RegisterVT = MVT::v8f16;
    break;
  case MVT::f32:
    IntermediateVT = RegisterVT = MVT::v4f32;
    break;
  case MVT::f64:
    IntermediateVT = RegisterVT = MVT::v2f64;
    break;
  case MVT::bf16:
    IntermediateVT = RegisterVT = MVT::v8bf16;
    break;
  }

  return NumRegs;
}

bool AArch64TargetLowering::hasInlineStackProbe(
    const MachineFunction &MF) const {
  return !Subtarget->isTargetWindows() &&
         MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
}
