File Manager

003 File Manager

Current Path: /usr/src/contrib/llvm-project/llvm/lib/Target/AMDGPU

usr / src / contrib / llvm-project / llvm / lib / Target / AMDGPU /

📁 ..
📄 AMDGPU.h(11.46 KB)
📄 AMDGPU.td(36.97 KB)
📄 AMDGPUAliasAnalysis.cpp(5.58 KB)
📄 AMDGPUAliasAnalysis.h(3.32 KB)
📄 AMDGPUAlwaysInlinePass.cpp(4.83 KB)
📄 AMDGPUAnnotateKernelFeatures.cpp(11.94 KB)
📄 AMDGPUAnnotateUniformValues.cpp(6.13 KB)
📄 AMDGPUArgumentUsageInfo.cpp(7.66 KB)
📄 AMDGPUArgumentUsageInfo.h(4.81 KB)
📄 AMDGPUAsmPrinter.cpp(50.42 KB)
📄 AMDGPUAsmPrinter.h(5.13 KB)
📄 AMDGPUAtomicOptimizer.cpp(23.79 KB)
📄 AMDGPUCallLowering.cpp(28.66 KB)
📄 AMDGPUCallLowering.h(2.37 KB)
📄 AMDGPUCallingConv.td(7.33 KB)
📄 AMDGPUCodeGenPrepare.cpp(46.42 KB)
📄 AMDGPUCombine.td(2.79 KB)
📄 AMDGPUExportClustering.cpp(4.52 KB)
📄 AMDGPUExportClustering.h(533 B)
📄 AMDGPUFeatures.td(1.81 KB)
📄 AMDGPUFixFunctionBitcasts.cpp(1.87 KB)
📄 AMDGPUFrameLowering.cpp(1.98 KB)
📄 AMDGPUFrameLowering.h(1.39 KB)
📄 AMDGPUGISel.td(11.57 KB)
📄 AMDGPUGenRegisterBankInfo.def(5.83 KB)
📄 AMDGPUGlobalISelUtils.cpp(1.77 KB)
📄 AMDGPUGlobalISelUtils.h(2.07 KB)
📄 AMDGPUHSAMetadataStreamer.cpp(31.21 KB)
📄 AMDGPUHSAMetadataStreamer.h(5.46 KB)
📄 AMDGPUISelDAGToDAG.cpp(101.59 KB)
📄 AMDGPUISelLowering.cpp(168.65 KB)
📄 AMDGPUISelLowering.h(19.23 KB)
📄 AMDGPUInline.cpp(7.97 KB)
📄 AMDGPUInstrInfo.cpp(1.71 KB)
📄 AMDGPUInstrInfo.h(1.66 KB)
📄 AMDGPUInstrInfo.td(17.18 KB)
📄 AMDGPUInstructionSelector.cpp(128.53 KB)
📄 AMDGPUInstructionSelector.h(11.04 KB)
📄 AMDGPUInstructions.td(25.36 KB)
📄 AMDGPULegalizerInfo.cpp(149.32 KB)
📄 AMDGPULegalizerInfo.h(8.49 KB)
📄 AMDGPULibCalls.cpp(53.89 KB)
📄 AMDGPULibFunc.cpp(37.85 KB)
📄 AMDGPULibFunc.h(10.99 KB)
📄 AMDGPULowerIntrinsics.cpp(4.55 KB)
📄 AMDGPULowerKernelArguments.cpp(8.89 KB)
📄 AMDGPULowerKernelAttributes.cpp(7.78 KB)
📄 AMDGPUMCInstLower.cpp(14.27 KB)
📄 AMDGPUMachineCFGStructurizer.cpp(101.97 KB)
📄 AMDGPUMachineFunction.cpp(2.24 KB)
📄 AMDGPUMachineFunction.h(2.13 KB)
📄 AMDGPUMachineModuleInfo.cpp(1.34 KB)
📄 AMDGPUMachineModuleInfo.h(5.46 KB)
📄 AMDGPUMacroFusion.cpp(2.28 KB)
📄 AMDGPUMacroFusion.h(679 B)
📄 AMDGPUOpenCLEnqueuedBlockLowering.cpp(5.31 KB)
📄 AMDGPUPTNote.h(1.29 KB)
📄 AMDGPUPerfHintAnalysis.cpp(12.17 KB)
📄 AMDGPUPerfHintAnalysis.h(1.67 KB)
📄 AMDGPUPostLegalizerCombiner.cpp(12.02 KB)
📄 AMDGPUPreLegalizerCombiner.cpp(5.45 KB)
📄 AMDGPUPrintfRuntimeBinding.cpp(21.7 KB)
📄 AMDGPUPromoteAlloca.cpp(35.24 KB)
📄 AMDGPUPropagateAttributes.cpp(11.76 KB)
📄 AMDGPURegBankCombiner.cpp(5.36 KB)
📄 AMDGPURegisterBankInfo.cpp(161.67 KB)
📄 AMDGPURegisterBankInfo.h(7.41 KB)
📄 AMDGPURegisterBanks.td(921 B)
📄 AMDGPURewriteOutArguments.cpp(15.82 KB)
📄 AMDGPUSearchableTables.td(21.04 KB)
📄 AMDGPUSubtarget.cpp(29.62 KB)
📄 AMDGPUSubtarget.h(35.82 KB)
📄 AMDGPUTargetMachine.cpp(42.67 KB)
📄 AMDGPUTargetMachine.h(4.52 KB)
📄 AMDGPUTargetObjectFile.cpp(1.54 KB)
📄 AMDGPUTargetObjectFile.h(1.14 KB)
📄 AMDGPUTargetTransformInfo.cpp(39.07 KB)
📄 AMDGPUTargetTransformInfo.h(11.11 KB)
📄 AMDGPUUnifyDivergentExitNodes.cpp(13.84 KB)
📄 AMDGPUUnifyMetadata.cpp(4.46 KB)
📄 AMDILCFGStructurizer.cpp(56.32 KB)
📄 AMDKernelCodeT.h(32.84 KB)
📁 AsmParser
📄 BUFInstructions.td(110.75 KB)
📄 CaymanInstructions.td(7.93 KB)
📄 DSInstructions.td(52.37 KB)
📁 Disassembler
📄 EvergreenInstructions.td(28.24 KB)
📄 FLATInstructions.td(66.93 KB)
📄 GCNDPPCombine.cpp(19.92 KB)
📄 GCNHazardRecognizer.cpp(45.3 KB)
📄 GCNHazardRecognizer.h(3.96 KB)
📄 GCNILPSched.cpp(11.3 KB)
📄 GCNIterativeScheduler.cpp(20.62 KB)
📄 GCNIterativeScheduler.h(4.16 KB)
📄 GCNMinRegStrategy.cpp(8.47 KB)
📄 GCNNSAReassign.cpp(10.92 KB)
📄 GCNProcessors.td(4.84 KB)
📄 GCNRegBankReassign.cpp(26.68 KB)
📄 GCNRegPressure.cpp(16.27 KB)
📄 GCNRegPressure.h(9.15 KB)
📄 GCNSchedStrategy.cpp(21.67 KB)
📄 GCNSchedStrategy.h(3.77 KB)
📁 MCTargetDesc
📄 MIMGInstructions.td(39.85 KB)
📄 R600.td(1.51 KB)
📄 R600AsmPrinter.cpp(4.46 KB)
📄 R600AsmPrinter.h(1.5 KB)
📄 R600ClauseMergePass.cpp(7.38 KB)
📄 R600ControlFlowFinalizer.cpp(23.4 KB)
📄 R600Defines.h(4.25 KB)
📄 R600EmitClauseMarkers.cpp(12.1 KB)
📄 R600ExpandSpecialInstrs.cpp(10.11 KB)
📄 R600FrameLowering.cpp(1.83 KB)
📄 R600FrameLowering.h(1.25 KB)
📄 R600ISelLowering.cpp(81.88 KB)
📄 R600ISelLowering.h(4.8 KB)
📄 R600InstrFormats.td(11.58 KB)
📄 R600InstrInfo.cpp(49.47 KB)
📄 R600InstrInfo.h(13.7 KB)
📄 R600Instructions.td(55.13 KB)
📄 R600MachineFunctionInfo.cpp(551 B)
📄 R600MachineFunctionInfo.h(824 B)
📄 R600MachineScheduler.cpp(13.57 KB)
📄 R600MachineScheduler.h(2.53 KB)
📄 R600OpenCLImageTypeLoweringPass.cpp(11.75 KB)
📄 R600OptimizeVectorRegisters.cpp(13.4 KB)
📄 R600Packetizer.cpp(13.4 KB)
📄 R600Processors.td(4.42 KB)
📄 R600RegisterInfo.cpp(3.95 KB)
📄 R600RegisterInfo.h(2 KB)
📄 R600RegisterInfo.td(9.75 KB)
📄 R600Schedule.td(1.62 KB)
📄 R700Instructions.td(783 B)
📄 SIAddIMGInit.cpp(6.24 KB)
📄 SIAnnotateControlFlow.cpp(11.18 KB)
📄 SIDefines.h(20.86 KB)
📄 SIFixSGPRCopies.cpp(29.46 KB)
📄 SIFixVGPRCopies.cpp(2 KB)
📄 SIFixupVectorISel.cpp(8.75 KB)
📄 SIFoldOperands.cpp(54.56 KB)
📄 SIFormMemoryClauses.cpp(12.76 KB)
📄 SIFrameLowering.cpp(48.08 KB)
📄 SIFrameLowering.h(2.98 KB)
📄 SIISelLowering.cpp(423.43 KB)
📄 SIISelLowering.h(22.13 KB)
📄 SIInsertHardClauses.cpp(7.01 KB)
📄 SIInsertSkips.cpp(15.29 KB)
📄 SIInsertWaitcnts.cpp(58.33 KB)
📄 SIInstrFormats.td(9.44 KB)
📄 SIInstrInfo.cpp(247.15 KB)
📄 SIInstrInfo.h(41.24 KB)
📄 SIInstrInfo.td(90.7 KB)
📄 SIInstructions.td(77.7 KB)
📄 SILoadStoreOptimizer.cpp(76.21 KB)
📄 SILowerControlFlow.cpp(22.66 KB)
📄 SILowerI1Copies.cpp(27.83 KB)
📄 SILowerSGPRSpills.cpp(12.68 KB)
📄 SIMachineFunctionInfo.cpp(20.01 KB)
📄 SIMachineFunctionInfo.h(26.91 KB)
📄 SIMachineScheduler.cpp(69.44 KB)
📄 SIMachineScheduler.h(15.65 KB)
📄 SIMemoryLegalizer.cpp(45.84 KB)
📄 SIModeRegister.cpp(17.43 KB)
📄 SIOptimizeExecMasking.cpp(12.81 KB)
📄 SIOptimizeExecMaskingPreRA.cpp(11.13 KB)
📄 SIPeepholeSDWA.cpp(42.84 KB)
📄 SIPostRABundler.cpp(3.6 KB)
📄 SIPreAllocateWWMRegs.cpp(6.09 KB)
📄 SIPreEmitPeephole.cpp(10.51 KB)
📄 SIProgramInfo.h(2.04 KB)
📄 SIRegisterInfo.cpp(71.51 KB)
📄 SIRegisterInfo.h(13.04 KB)
📄 SIRegisterInfo.td(37.28 KB)
📄 SIRemoveShortExecBranches.cpp(4.96 KB)
📄 SISchedule.td(7.58 KB)
📄 SIShrinkInstructions.cpp(26.86 KB)
📄 SIWholeQuadMode.cpp(30.22 KB)
📄 SMInstructions.td(48.14 KB)
📄 SOPInstructions.td(60.51 KB)
📁 TargetInfo
📁 Utils
📄 VIInstrFormats.td(645 B)
📄 VOP1Instructions.td(35.53 KB)
📄 VOP2Instructions.td(65.04 KB)
📄 VOP3Instructions.td(53.14 KB)
📄 VOP3PInstructions.td(26.47 KB)
📄 VOPCInstructions.td(63.31 KB)
📄 VOPInstructions.td(23.76 KB)

Editing: SIPreEmitPeephole.cpp

//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs the peephole optimizations before code emission.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

namespace {

/// Machine function pass that applies a small set of peephole rewrites to
/// SI code right before emission: simplification of VCC-conditional branches
/// and removal of repeated S_SET_GPR_IDX_ON instructions.
class SIPreEmitPeephole : public MachineFunctionPass {
public:
  /// Pass identification token handed to the MachineFunctionPass base.
  static char ID;

  SIPreEmitPeephole() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
  }

  /// Entry point; returns true when \p MF was changed.
  bool runOnMachineFunction(MachineFunction &MF) override;

private:
  // Target hooks. Both are (re)assigned at the top of runOnMachineFunction()
  // and are null until then.
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  /// Try to simplify the VCC-conditional branch \p MI.
  bool optimizeVccBranch(MachineInstr &MI) const;

  /// Try to remove \p MI when it merely repeats the earlier S_SET_GPR_IDX_ON
  /// \p First.
  bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
};

} // End anonymous namespace.

// Pass registration boilerplate: DEBUG_TYPE is used as the pass argument and
// the string literal as its human-readable name. This is expected to provide
// the initializeSIPreEmitPeepholePass() routine the constructor calls.
INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

// Definition of the static pass ID declared in the class.
char SIPreEmitPeephole::ID = 0;

// Reference to the pass ID exposed in the llvm namespace so code outside this
// file can refer to the pass without seeing the class.
char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;

/// Simplify the VCC-conditional branch \p MI when VCC is produced by an
/// S_AND / S_ANDN2 of EXEC with a value known to be 0 or -1.
///
/// Depending on the known mask the branch is erased, rewritten into an
/// unconditional S_BRANCH, or rewritten into S_CBRANCH_EXECZ/EXECNZ. The
/// defining AND and the move-immediate feeding it are erased when they become
/// unused.
///
/// \param MI the S_CBRANCH_VCCZ or S_CBRANCH_VCCNZ terminator to optimize.
/// \returns true if any instruction was changed, false if the pattern did
/// not match and nothing was touched.
bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens while combining a block which assigns -1 or 0 to a saved mask
  // and another block which consumes that saved mask and then a branch.
  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;

  // Walk backwards from the branch looking for the instruction that defines
  // the condition register. The walk is bounded by Threshold and gives up if
  // EXEC is modified on the way.
  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      // Only a full definition by S_AND / S_ANDN2 is interesting.
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    // Remember whether anything between the AND and the branch reads VCC;
    // if so the AND cannot be deleted later.
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  // Either source of the AND may be an immediate, and getReg() is only valid
  // on register operands, so always test isReg() first.
  const auto IsExec = [ExecReg](const MachineOperand &MO) {
    return MO.isReg() && MO.getReg() == ExecReg;
  };
  // Canonicalize so that EXEC is the first source operand.
  if (!IsExec(Op1) && IsExec(Op2)) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (!IsExec(Op1))
    return Changed;
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isReg()) {
    // Find the definition of the mask register above the AND; it has to be a
    // move of the immediate 0 or -1.
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
    }
    if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // First if sreg is only used in the AND instruction fold the immediate
    // into the AND.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert mask for s_andn2
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;

  // The AND itself is dead when nothing between it and the branch reads VCC,
  // its SCC def is dead, and the branch kills VCC.
  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
      MI.killsRegister(CondReg, TRI))
    A->eraseFromParent();

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch
    // Remove all successors shadowed by new unconditional branch
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (auto BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
      Parent->removeSuccessor(Succ);
    }

    // Rewrite to unconditional branch
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  // The rewritten branch no longer uses VCC: drop that operand and add the
  // implicit operands of the new opcode.
  MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}

/// Remove \p MI, a second S_SET_GPR_IDX_ON, when it is identical to the
/// earlier \p First and nothing between the two invalidates the index state.
///
/// Any S_SET_GPR_IDX_OFF found between the two instructions is removed along
/// with \p MI.
///
/// \returns true if \p MI (and the intervening OFFs) were erased, false if
/// nothing was changed.
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  const MachineRegisterInfo &RegInfo =
      MI.getParent()->getParent()->getRegInfo();

  // Register holding the index, if src0 is a register at all.
  const MachineOperand *IndexOp =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IndexReg;
  if (IndexOp->isReg())
    IndexReg = IndexOp->getReg();

  if (!MI.isIdenticalTo(First))
    return false;

  // True for operands that name a vector register.
  const auto IsVectorRegOperand = [&RegInfo, this](const MachineOperand &MO) {
    return MO.isReg() && TRI->isVectorRegister(RegInfo, MO.getReg());
  };

  SmallVector<MachineInstr *, 4> PendingOffs;
  bool IndexingEnabled = true;

  // Inspect every instruction strictly between First and MI.
  const MachineBasicBlock::iterator Stop = MI.getIterator();
  for (MachineBasicBlock::iterator It = std::next(First.getIterator());
       It != Stop; ++It) {
    const unsigned Opc = It->getOpcode();

    // A mode change makes the second ON necessary.
    if (Opc == AMDGPU::S_SET_GPR_IDX_MODE)
      return false;

    // An OFF becomes removable if the second ON goes away; from here on
    // indexing is considered disabled.
    if (Opc == AMDGPU::S_SET_GPR_IDX_OFF) {
      IndexingEnabled = false;
      PendingOffs.push_back(&*It);
      continue;
    }

    if (It->modifiesRegister(AMDGPU::M0, TRI))
      return false;
    if (IndexReg && It->modifiesRegister(IndexReg, TRI))
      return false;

    if (!llvm::any_of(It->operands(), IsVectorRegOperand))
      continue;

    // An instruction touching vector registers is tolerated only while
    // indexing is still on, and only if it is another indirect vector move
    // with the same mode.
    if (!IndexingEnabled)
      return false;
    const bool IsIndirectMove =
        (Opc == AMDGPU::V_MOV_B32_e32 &&
         It->hasRegisterImplicitUseOperand(AMDGPU::M0)) ||
        Opc == AMDGPU::V_MOV_B32_indirect;
    if (!IsIndirectMove)
      return false;
  }

  MI.eraseFromParent();
  for (MachineInstr *Off : PendingOffs)
    Off->eraseFromParent();
  return true;
}

/// Run the pre-emit peepholes over every basic block of \p MF.
///
/// Per block this
///  - tries optimizeVccBranch() on a leading S_CBRANCH_VCCZ/VCCNZ terminator,
///  - replaces any SI_RETURN_TO_EPILOG that is not the very last instruction
///    of the function with a branch to an empty block appended at the end,
///  - on subtargets with VGPR index mode, removes redundant
///    S_SET_GPR_IDX_ON instructions via optimizeSetGPR().
///
/// \returns true if the function was modified in any way.
bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MachineBasicBlock *EmptyMBBAtEnd = nullptr;
  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
    MachineBasicBlock::iterator TermI = MBBE;
    // Check first terminator for VCC branches to optimize
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        // optimizeVccBranch() may have erased MI, so MBBE/TermI must not be
        // used afterwards; the rest of the work is skipped for this block.
        continue;
      default:
        break;
      }
    }
    // Check all terminators for SI_RETURN_TO_EPILOG
    // FIXME: This is not an optimization and should be moved somewhere else.
    while (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (&MBB != &MF.back() || &MI != &MBB.back()) {
          // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block
          // at the end and jump there.
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
              .addMBB(EmptyMBBAtEnd);
          MI.eraseFromParent();
          // The CFG and instruction stream were modified; this has to be
          // reflected in the return value.
          Changed = true;
          // The terminator sequence changed, so restart from its new head.
          MBBE = MBB.getFirstTerminator();
          TermI = MBBE;
          continue;
        }
      }
      ++TermI;
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
    // second is not needed. Do expensive checks in the optimizeSetGPR()
    // and limit the distance to 20 instructions for compile time purposes.
    for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBBE; ) {
      MachineInstr &MI = *MBBI;
      // Advance first: optimizeSetGPR() may erase MI.
      ++MBBI;

      // Forget the remembered S_SET_GPR_IDX_ON once it is too far behind.
      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      // On success MI is gone and SetGPRMI stays the reference instruction;
      // otherwise MI becomes the new reference.
      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  return Changed;
}

003 File Manager

Editing: SIPreEmitPeephole.cpp

Upload File

Create Folder