//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %0, 16, %1
///   V_ADD_CO_U32_e32 %2, %0, %3
///   V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replace:
///   V_ADD_CO_U32_sdwa %4, %1, %3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

class SDWAOperand;
class SDWADstOperand;

class SIPeepholeSDWA : public MachineFunctionPass {
public:
  using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;

private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  MapVector<MachineInstr *, SDWAOperandsVector> PotentialMatches;
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  std::optional<int64_t> foldToImm(const MachineOperand &Op) const;

public:
  static char ID;

  SIPeepholeSDWA() : MachineFunctionPass(ID) {
    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
  void pseudoOpConvertToVOP2(MachineInstr &MI,
                             const GCNSubtarget &ST) const;
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

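/// Base class for a matched SDWA pattern. Records the operand that the
/// converted instruction will read or write (Target) and the operand of the
/// matched instruction that it replaces (Replaced).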
class SDWAOperand {
private:
  MachineOperand *Target; // Operand used in the converted instruction.
  MachineOperand *Replaced; // Operand that will be replaced by Target.

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};

using namespace AMDGPU::SDWA;

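/// Pattern that folds into an SDWA source operand: the selected sub-dword
/// (SrcSel) of Target is read, optionally with abs/neg float modifiers or
/// integer sign extension (Sext).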
class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp),
        SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

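/// Pattern that folds into an SDWA destination operand: the result is written
/// to the selected sub-dword (DstSel) of Target, with the unused bits handled
/// according to DstUn (e.g. zero-padded).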
class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:

  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
    : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

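/// Destination pattern using dst_unused:UNUSED_PRESERVE: the bits of the
/// destination not selected by dst_sel are taken from the Preserve operand
/// instead of being padded.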
class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};

} // end anonymous namespace

INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)

char SIPeepholeSDWA::ID = 0;

char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;

FunctionPass *llvm::createSIPeepholeSDWAPass() {
  return new SIPeepholeSDWA();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch(Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD:  OS << "DWORD"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch(Un) {
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream& OS) const {
  OS << "SDWA src: " << *getTargetOperand()
    << " src_sel:" << getSrcSel()
    << " abs:" << getAbs() << " neg:" << getNeg()
    << " sext:" << getSext() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream& OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
    << " dst_sel:" << getDstSel()
    << " dst_unused:" << getDstUnused() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream& OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
    << " dst_sel:" << getDstSel()
    << " preserve:" << *getPreservedOperand() << '\n';
}

#endif

static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

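// Return a use of the register defined by Reg if all its uses (with matching
// subregister) are in a single instruction, otherwise nullptr.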
static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  MachineOperand *ResMO = nullptr;
  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
    // If there exists a use of a subreg of Reg, return nullptr.
    if (!isSameReg(UseMO, *Reg))
      return nullptr;

    // Check that there is only one instruction that uses Reg.
    if (!ResMO) {
      ResMO = &UseMO;
    } else if (ResMO->getParent() != UseMO.getParent()) {
      return nullptr;
    }
  }

  return ResMO;
}

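// Return the explicit def operand of Reg's register in its unique defining
// instruction, or nullptr if there is no unique def or the register is only
// defined implicitly.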
static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;

  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
  if (!DefInstr)
    return nullptr;

  for (auto &DefMO : DefInstr->defs()) {
    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
      return &DefMO;
  }

  // Ignore implicit defs.
  return nullptr;
}

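// Combine the src modifiers already present on SrcOp's slot in its parent
// instruction with the modifiers recorded for this SDWA operand. NEG is
// XOR'ed in so that a matched negation composes with an existing neg
// modifier.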
uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA src operand, the potential instruction is the single one that
  // uses the register defined by the parent instruction.
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  return PotentialMO->getParent();
}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Find the operand in MI that matches the replaced operand and replace it
  // with the target operand. Set the corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for UNUSED_PRESERVE, in
      // which case we can either abandon the peephole attempt, or, if legal,
      // copy the target operand into the tied slot when the preserve
      // operation will effectively produce the same result by overwriting
      // the rest of the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
        TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
         !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
      // src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SrcSel->setImm(getSrcSel());
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}

MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA dst operand, the potential instruction is the one that
  // defines the register that this operand uses.
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced
  // register.
  for (MachineInstr &UseInst :
       MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  return PotentialMO->getParent();
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace the vdst operand in MI with the target operand. Set dst_sel and
  // dst_unused.

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allows dst_sel to be equal only to DWORD.
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused =
      TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove the original instruction because it would conflict with our new
  // instruction's register definition.
  getParentInst()->eraseFromParent();
  return true;
}

bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we must clear all kill flags on uses of MI's source operands;
  // otherwise we can run into problems with a use of a killed operand.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32.
  auto MBB = MI.getParent();
  MBB->remove(&MI);
  MBB->insert(getParentInst(), &MI);

  // Add an implicit use of the preserved register.
  MachineInstrBuilder MIB(*MBB->getParent(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to the implicit use.
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32.
  return SDWADstOperand::convertToSDWA(MI, TII);
}

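// Try to reduce Op to a compile-time immediate: either Op itself is an
// immediate, or it is a register whose def is a foldable copy of an
// immediate.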
std::optional<int64_t>
SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.:
  // %1 = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return std::nullopt;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return std::nullopt;

      return Copied.getImm();
    }
  }

  return std::nullopt;
}

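// Match a single instruction against the peephole patterns below and, on
// success, return an SDWAOperand describing how it can be folded into a
// neighboring instruction as an SDWA src or dst operand.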
std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    } else {
      return std::make_unique<SDWASrcOperand>(
          Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
          Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
          Opcode != AMDGPU::V_LSHRREV_B32_e64);
    }
    break;
  }

  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64) {
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    } else {
      return std::make_unique<SDWASrcOperand>(
            Src1, Dst, BYTE_1, false, false,
            Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
            Opcode != AMDGPU::V_LSHRREV_B16_e64);
    }
    break;
  }

  case AMDGPU::V_BFE_I32_e64:
  case AMDGPU::V_BFE_U32_e64: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    // 0      | 8     | BYTE_0
    // 0      | 16    | WORD_0
    // 0      | 32    | DWORD ?
    // 8      | 8     | BYTE_1
    // 16     | 8     | BYTE_2
    // 16     | 16    | WORD_1
    // 24     | 8     | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src0->isReg() || Src0->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
          Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }

  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src0_sel:WORD_1 src1_sel:WORD_1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE
    // preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction.
    using CheckRetType =
        std::optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
      [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
        if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
          return CheckRetType(std::nullopt);

        MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
        if (!Op1Def)
          return CheckRetType(std::nullopt);

        MachineInstr *Op1Inst = Op1Def->getParent();
        if (!TII->isSDWA(*Op1Inst))
          return CheckRetType(std::nullopt);

        MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
        if (!Op2Def)
          return CheckRetType(std::nullopt);

        return CheckRetType(std::pair(Op1Def, Op2Def));
      };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInst is actually bitwise compatible with SDWAInst,
    // i.e. their destination patterns don't overlap. A compatible instruction
    // can be either a regular instruction with compatible bitness or an SDWA
    // instruction with the correct dst_sel.
    // SDWAInst | OtherInst bitness / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                    / no
    // WORD_0   | no                    / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
    // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit              / BYTE_0/1/3, WORD_0
    // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular instructions
    // there is no way to determine if the instruction writes only 8/16/24 bits
    // out of the full register size, and all registers are at least 32 bits
    // wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
      TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also OtherInst dst_unused should be UNUSED_PAD
    DstUnused OtherDstUnused = static_cast<DstUnused>(
      TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create DstPreserveOperand
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return std::make_unique<SDWADstPreserveOperand>(
      OrDst, OrSDWADef, OrOtherDef, DstSel);
  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}

#if !defined(NDEBUG)
static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}
#endif

void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}

// Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows
// isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into
// V_ADD_CO_U32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
//   %19:vgpr_32 = V_AND_B32_e32 255,
//       killed %16:vgpr_32, implicit $exec
//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
//  %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//
// becomes
//   %47:vgpr_32 = V_ADD_CO_U32_sdwa
//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
//       implicit-def $vcc, implicit $exec
//  %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related ADD instruction.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();

  // Make sure the carry in/out are subsequently unused.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) ||
      !MRI->use_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI.
  MachineBasicBlock &MBB = *MI.getParent();
  auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check if VCC is referenced in range of (MI,MISucc].
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }

  // Replace MI with V_{SUB|ADD}_I32_e32
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
    .setMIFlags(MI.getFlags());

  MI.eraseFromParent();

  // Since the carry output of MI is now VCC, update its use in MISucc.
  MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}

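// Check whether MI either already is, or can be turned into, an SDWA
// instruction on this subtarget, honoring per-generation restrictions such
// as omod, scalar dst and v_mac support.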
bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
                                         const GCNSubtarget &ST) const {
  // Check if this is already an SDWA instruction.
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
    return true;

  // Check if this instruction has an opcode that supports SDWA.
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst =
          TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // Check if the target supports this SDWA opcode.
  if (TII->pseudoToMCOpcode(Opc) == -1)
    return false;

  // FIXME: has SDWA but requires handling of implicit VCC use.
  if (Opc == AMDGPU::V_CNDMASK_B32_e32)
    return false;

  if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
    if (!Src0->isReg() && !Src0->isImm())
      return false;
  }

  if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
    if (!Src1->isReg() && !Src1->isImm())
      return false;
  }

  return true;
}

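// Rewrite MI into its SDWA form and apply all matched SDWAOperands to it.
// Returns false (and erases the half-built SDWA instruction) if none of the
// operands could be applied.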
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {
  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  // Convert to SDWA.
  int SDWAOpcode;
  unsigned Opcode = MI.getOpcode();
  if (TII->isSDWA(Opcode)) {
    SDWAOpcode = Opcode;
  } else {
    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
    if (SDWAOpcode == -1)
      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  }
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create the SDWA version of instruction MI and initialize its operands.
  MachineInstrBuilder SDWAInst =
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
    .setMIFlags(MI.getFlags());

  // Copy dst; if it is present in the original it should also be present in
  // the SDWA instruction.
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0 and initialize src0_modifiers. All SDWA instructions have src0
  // and src0_modifiers (except for v_nop_sdwa, but it can't get here).
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) &&
         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers));
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) &&
           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers));
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise
  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::clamp));
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::omod)) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Copy dst_sel if present, initialize otherwise if needed
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel)) {
    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
    if (DstSel) {
      SDWAInst.add(*DstSel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Copy dst_unused if present, initialize otherwise if needed
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused)) {
    MachineOperand *DstUnused =
        TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    if (DstUnused) {
      SDWAInst.add(*DstUnused);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
    }
  }

  // Copy src0_sel if present, initialize otherwise
  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  if (Src0Sel) {
    SDWAInst.add(*Src0Sel);
  } else {
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Copy src1_sel if present, initialize otherwise if needed
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
    MachineOperand *Src1Sel =
        TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    if (Src1Sel) {
      SDWAInst.add(*Src1Sel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Check for a preserved register that needs to be copied.
  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  if (DstUnused &&
      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
    // We expect, if we are here, that the instruction was already in its SDWA
    // form, with a tied operand.
    assert(Dst && Dst->isTied());
    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
    // We also expect a vdst, since sdst can't preserve.
    auto PreserveDstIdx =
        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
    assert(PreserveDstIdx != -1);

    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
    auto Tied = MI.getOperand(TiedIdx);

    SDWAInst.add(Tied);
    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
  }

  // Apply all sdwa operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold 2nd instruction into
    // 3rd (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that
    // was already destroyed). So if SDWAOperand is also a potential MI then do
    // not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }
  if (Converted) {
    ConvertedInstructions.push_back(SDWAInst);
  } else {
    SDWAInst->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

// An instruction converted to SDWA should not have immediates or SGPR
// operands (one SGPR operand is allowed on GFX9). Copy its scalar operands
// into VGPRs.
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    unsigned I = Op.getOperandNo();
    if (Desc.operands()[I].RegClass == -1 ||
        !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass)))
      continue;

    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}

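// Per-block driver: repeatedly match SDWA operands, pre-lower eligible
// V_{ADD|SUB}_CO_U32_e64 pairs to VOP2 form, convert matched instructions to
// SDWA, and legalize their scalar operands, until no more changes occur.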
bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
        if (PotentialMI &&
           (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
            PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
          pseudoOpConvertToVOP2(*PotentialMI, ST);
      }
      SDWAOperands.clear();

      // Generate potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
          PotentialMatches[PotentialMI].push_back(Operand.get());
        }
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}