diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 5df11a45b4889..b46ee685f674f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -45,6 +45,8 @@ FunctionPass *createSIWholeQuadModeLegacyPass(); FunctionPass *createSIFixControlFlowLiveIntervalsPass(); FunctionPass *createSIOptimizeExecMaskingPreRAPass(); FunctionPass *createSIOptimizeVGPRLiveRangeLegacyPass(); +FunctionPass *createAMDGPUNextUseAnalysisPass(); +FunctionPass *createAMDGPUEarlyRegisterSpillingPass(); FunctionPass *createSIFixSGPRCopiesLegacyPass(); FunctionPass *createLowerWWMCopiesPass(); FunctionPass *createSIMemoryLegalizerPass(); @@ -191,6 +193,12 @@ extern char &SIFixSGPRCopiesLegacyID; void initializeSIFixVGPRCopiesLegacyPass(PassRegistry &); extern char &SIFixVGPRCopiesID; +void initializeAMDGPUNextUseAnalysisPassPass(PassRegistry &); +extern char &AMDGPUNextUseAnalysisID; + +void initializeAMDGPUEarlyRegisterSpillingPass(PassRegistry &); +extern char &AMDGPUEarlyRegisterSpillingID; + void initializeSILowerWWMCopiesLegacyPass(PassRegistry &); extern char &SILowerWWMCopiesLegacyID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUEarlyRegisterSpilling.cpp b/llvm/lib/Target/AMDGPU/AMDGPUEarlyRegisterSpilling.cpp new file mode 100644 index 0000000000000..1891ac6197b32 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUEarlyRegisterSpilling.cpp @@ -0,0 +1,862 @@ +//===------------------- AMDGPUEarlyRegisterSpilling.cpp -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUEarlyRegisterSpilling.h" +#include "AMDGPU.h" +#include "AMDGPUNextUseAnalysis.h" +#include "GCNSubtarget.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-early-register-spilling" + +STATISTIC(NumOfERSSpills, "Number of ERS spills"); + +static cl::opt EarlyRegisterSpilling("early-register-spilling", + cl::init(true), cl::Hidden); + +// TODO: Remove this flag. +static cl::opt + VGPRMaxNums("max-vgprs", cl::init(0), cl::Hidden, + cl::desc("The maximum number of VGPRs per wave.")); + +// TODO: Preserve SlotIndexes analysis in getAnalysisUsage() +void AMDGPUEarlyRegisterSpilling::updateIndexes(MachineInstr *MI) { + if (Indexes->hasIndex(*MI)) + Indexes->removeMachineInstrFromMaps(*MI); + Indexes->insertMachineInstrInMaps(*MI); +} + +// TODO: Preserve LiveIntervals analysis in getAnalysisUsage() +void AMDGPUEarlyRegisterSpilling::updateLiveness(Register Reg) { + if (LIS->hasInterval(Reg)) + LIS->removeInterval(Reg); + LIS->createAndComputeVirtRegInterval(Reg); +} + +void AMDGPUEarlyRegisterSpilling::updateLiveness(MachineInstr *MI) { + for (auto &MO : MI->operands()) { + if (!MO.isReg()) + continue; + auto Reg = MO.getReg(); + if (!Reg.isVirtual()) + continue; + if (LIS->hasInterval(Reg)) + LIS->removeInterval(Reg); + LIS->createAndComputeVirtRegInterval(Reg); + } +} + +// We need this because it does not make sense to spill a def which has a use in +// a phi at the beginning of a basic block and it is defined a bit later. 
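// For instance, in a single-block loop the block is both header and latch, so
// its PHI can read a value that is only defined further down in the same
// block (schematic MIR, names are made up):
//
//   bb.1:
//     %phi = PHI %init, %bb.0, %v, %bb.1   ; use of %v at the top of bb.1
//     ...
//     %v = ...                             ; %v is defined later in bb.1
//     ...                                  ; loop back-edge to bb.1
//
// Spilling such a def buys nothing, so it is rejected as a candidate (see
// hasPHIUseInSameBB below).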
+bool AMDGPUEarlyRegisterSpilling::hasPHIUseInSameBB(Register Reg, + MachineBasicBlock *CurMBB) { + for (auto &UseMI : MRI->use_nodbg_instructions(Reg)) + if (UseMI.isPHI() && UseMI.getParent() == CurMBB) + return true; + return false; +} + +MachineInstr * +AMDGPUEarlyRegisterSpilling::emitRestore(Register DefRegToSpill, + MachineInstr *DefRegUseInstr, int FI) { + const TargetRegisterClass *RC = TRI->getRegClassForReg(*MRI, DefRegToSpill); + Register NewReg = MRI->createVirtualRegister(RC); + RestoredRegs.insert(NewReg); + MachineBasicBlock *DefRegUseInstrBB = DefRegUseInstr->getParent(); + MachineInstr *Restore = nullptr; + assert(DefRegUseInstr->getOpcode() != AMDGPU::PHI && + "We cannot emit a restore instruction before a phi node"); + TII->loadRegFromStackSlot(*DefRegUseInstrBB, DefRegUseInstr->getIterator(), + NewReg, FI, RC, 0); + Restore = DefRegUseInstr->getPrevNode(); + DefRegUseInstr->substituteRegister(DefRegToSpill, NewReg, 0, *TRI); + LIS->InsertMachineInstrInMaps(*Restore); + LLVM_DEBUG(dbgs() << "Emit restore before use: " << *DefRegUseInstr); + LLVM_DEBUG(dbgs() << "Restore instruction = " << *Restore); + LLVM_DEBUG(dbgs() << "Restore block = " << Restore->getParent()->getName() + << "\n"); + LLVM_DEBUG(dbgs() << "Register to replace spilled register = " + << printReg(NewReg, TRI) << "\n"); + return Restore; +} + +MachineInstr * +AMDGPUEarlyRegisterSpilling::emitRestore(Register DefRegToSpill, + MachineBasicBlock &InsertBB, int FI) { + const TargetRegisterClass *RC = TRI->getRegClassForReg(*MRI, DefRegToSpill); + Register NewReg = MRI->createVirtualRegister(RC); + RestoredRegs.insert(NewReg); + auto It = InsertBB.getFirstTerminator(); + if (It == InsertBB.end()) + It = InsertBB.instr_end(); + TII->loadRegFromStackSlot(*&InsertBB, It, NewReg, FI, RC, 0); + MachineInstr *Restore = &*(std::prev(It)); + LIS->InsertMachineInstrInMaps(*Restore); + LLVM_DEBUG(dbgs() << "Restore instruction = " << *Restore); + LLVM_DEBUG(dbgs() << "Emit restore at the end of basic block: = " + << Restore->getParent()->getName() << "\n"); + LLVM_DEBUG(dbgs() << "Register to replace spilled register = " + << printReg(NewReg, TRI) << "\n"); + return Restore; +} + +// TODO: Tune this check to improve spilling. 
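// Aside (illustration only, simplified types; not the pass's actual API): the
// selection policy implemented below follows Braun & Hack -- at a
// high-pressure point, evict the live values whose next use is estimated to
// be furthest away.  Stripped to its core:
//
//   #include <algorithm>
//   #include <cstddef>
//   #include <utility>
//   #include <vector>
//
//   // 'Live' holds (value, next-use-distance) pairs at the program point;
//   // 'Budget' is the number of registers available.
//   std::vector<unsigned>
//   pickFurthestNextUse(std::vector<std::pair<unsigned, double>> Live,
//                       std::size_t Budget) {
//     std::vector<unsigned> ToSpill;
//     if (Live.size() <= Budget)
//       return ToSpill;                               // pressure already fits
//     std::sort(Live.begin(), Live.end(), [](const auto &A, const auto &B) {
//       return A.second > B.second;                   // furthest use first
//     });
//     for (std::size_t I = 0, N = Live.size() - Budget; I != N; ++I)
//       ToSpill.push_back(Live[I].first);             // evict only the excess
//     return ToSpill;
//   }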
+bool AMDGPUEarlyRegisterSpilling::isLegalToSpill(Register CandidateReg) { + assert(MRI->hasOneDef(CandidateReg) && + "The Register does not have one definition"); + MachineInstr *CandidateMI = MRI->getOneDef(CandidateReg)->getParent(); + return !hasPHIUseInSameBB(CandidateReg, CandidateMI->getParent()) && + !MRI->use_nodbg_empty(CandidateReg) && + !TII->isVGPRSpill(CandidateMI->getOpcode()) && + !isSpilledReg(CandidateReg) && !isRestoredReg(CandidateReg) && + !CandidateReg.isPhysical() && !TRI->isAGPR(*MRI, CandidateReg) && + !CandidateMI->isTerminator() && TRI->isVGPR(*MRI, CandidateReg); +} + +SmallVector AMDGPUEarlyRegisterSpilling::getRegistersToSpill( + MachineInstr *CurMI, GCNDownwardRPTracker &RPTracker) { + MachineBasicBlock *CurMBB = CurMI->getParent(); + bool IsCurMIInLoop = MLI->getLoopFor(CurMI->getParent()); + SmallVector> RegCandidates; + MachineLoop *OutermostLoop = nullptr; + double LoopDistance = 0; + LLVM_DEBUG(dbgs() << "===========================================\n"); + if (IsCurMIInLoop) { + OutermostLoop = MLI->getLoopFor(CurMI->getParent())->getOutermostLoop(); + auto [DistanceFromHeaderToExitingLatch, ExitingLatch] = + NUA.getLoopDistanceAndExitingLatch(OutermostLoop->getHeader()); + LoopDistance = + (DistanceFromHeaderToExitingLatch + ExitingLatch->size()) * LoopWeight; + } + + for (auto [CandidateReg, Mask] : RPTracker.getLiveRegs()) { + + if (!isLegalToSpill(CandidateReg)) + continue; + + assert(MRI->hasOneDef(CandidateReg) && + "The Register does not have one definition"); + MachineInstr *CandidateMI = MRI->getOneDef(CandidateReg)->getParent(); + MachineLoop *CandidateLoop = MLI->getLoopFor(CandidateMI->getParent()); + bool IsLoopCandidate = + IsCurMIInLoop && + (!CandidateLoop || (CandidateLoop && OutermostLoop && + ((CandidateLoop != OutermostLoop) || + !OutermostLoop->contains(CandidateLoop)))); + + if (IsCurMIInLoop && !IsLoopCandidate) + continue; + + SmallVector Uses; + for (auto &UseMI : MRI->use_nodbg_instructions(CandidateReg)) { + MachineBasicBlock *UseMBB = UseMI.getParent(); + if (isReachable(CurMBB, UseMBB) || + (CurMBB == UseMBB && DT->dominates(CurMI, &UseMI))) + Uses.push_back(&UseMI); + } + + if (Uses.empty()) + continue; + + auto NextUseDist = NUA.getNextUseDistance(CandidateReg, CurMI, Uses); + + if (!IsCurMIInLoop) { + // If CurMI is not in a loop, then we collect the registers that we + // can spill based on their next-use-distance from CurMI in + // 'RegCandidates'. + RegCandidates.push_back(std::make_pair(CandidateReg, *NextUseDist)); + LLVM_DEBUG(dbgs() << "Candidate register to spill = " + << printReg(CandidateReg, TRI) + << " with distance = " << *NextUseDist << "\n"); + } else if (IsLoopCandidate && (NextUseDist > LoopDistance)) { + // Collect only the live-through values. + RegCandidates.push_back(std::make_pair(CandidateReg, *NextUseDist)); + LLVM_DEBUG(dbgs() << "Candidate register to spill = " + << printReg(CandidateReg, TRI) + << " with distance = " << *NextUseDist << "\n"); + } + } + + LLVM_DEBUG(dbgs() << "==========================================\n"); + if (RegCandidates.empty()) + return {}; + + // Return the registers with the longest next-use distance. 
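// Aside (worked example, made-up numbers): if the tracker reports candidates
//   { (%a, 12), (%b, 2150), (%c, 47) }
// where the second element is the estimated next-use distance in weighted
// instructions, the descending sort below yields %b, %c, %a, and a caller
// that needs NumOfSpills == 2 spills %b and %c -- the values whose reloads
// can be pushed furthest away.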
+ llvm::sort(RegCandidates, [](auto &Pair1, auto &Pair2) { + return Pair1.second > Pair2.second; + }); + + SmallVector RegistersToSpill(llvm::make_first_range(RegCandidates)); + for (auto P : RegCandidates) + RegistersToSpill.push_back(P.first); + + return RegistersToSpill; +} + +// Helper function for finding the incoming blocks that are related to +// DefRegToSpill +static SmallVector +getPhiBlocksOfSpillReg(MachineInstr *UseMI, Register DefRegToSpill) { + assert(UseMI->isPHI() && "The use is not phi instruction"); + SmallVector Blocks; + auto Ops = UseMI->operands(); + for (auto It = std::next(Ops.begin()), ItE = Ops.end(); It != ItE; + It = std::next(It, 2)) { + auto &RegMO = *It; + if (RegMO.isUndef()) + continue; + auto &MBBMO = *std::next(It); + assert(RegMO.isReg() && "Expected register operand of PHI"); + assert(MBBMO.isMBB() && "Expected MBB operand of PHI"); + if (RegMO.getReg() == DefRegToSpill) + Blocks.push_back(MBBMO.getMBB()); + } + return Blocks; +} + +// TODO: check if the common dominator of restores is profitable +bool AMDGPUEarlyRegisterSpilling::shouldEmitRestoreInCommonDominator( + MachineBasicBlock *SpillBlock, MachineBasicBlock *CurMBB, + MachineBasicBlock *CommonDominatorToRestore) { + if (SpillBlock == CommonDominatorToRestore) + return false; + if (CurMBB == CommonDominatorToRestore) + return false; + if (DT->dominates(CommonDominatorToRestore, SpillBlock)) + return false; + if (isReachable(CommonDominatorToRestore, SpillBlock)) + return false; + if (!DT->dominates(SpillBlock, CommonDominatorToRestore)) + return false; + if (MLI->getLoopFor(CommonDominatorToRestore)) + return false; + return true; +} + +/// Helper data structure for grouping together uses where the head of the group +/// dominates all the other uses in the group. +class DomGroup { + SmallVector Uses; + SmallVector UseBlocks; + MachineBasicBlock *CommonDominator = nullptr; + bool Deleted = false; + +public: + DomGroup(MachineInstr *MI, MachineBasicBlock *RestoreBlock) { + Uses.push_back(MI); + UseBlocks.push_back(RestoreBlock); + } + MachineInstr *getHead() const { return Uses.front(); } + bool isDeleted() const { return Deleted; } + void merge(DomGroup &Other) { + for (auto *MI : Other.Uses) + Uses.push_back(MI); + + for (auto *UseMBB : Other.UseBlocks) + UseBlocks.push_back(UseMBB); + + Other.Deleted = true; + } + const auto &getUses() const { return Uses; } + const auto &getUseBlocks() const { return UseBlocks; } + size_t size() const { return Uses.size(); } + void setCommonDominator(MachineBasicBlock *CD) { CommonDominator = CD; } + MachineBasicBlock *getCommonDominator() const { return CommonDominator; } + bool hasCommonDominator() const { return CommonDominator != nullptr; } + MachineBasicBlock *getRestoreBlock() const { return UseBlocks.front(); } +}; + +void AMDGPUEarlyRegisterSpilling::emitRestoreInstrsForDominatedUses( + Register DefRegToSpill, MachineInstr *SpillInstruction, MachineInstr *CurMI, + SetVectorType &DominatedUses, SmallVector &RestoreInstrs, + SmallVector &RestoreUses, int FI) { + MachineBasicBlock *SpillBlock = SpillInstruction->getParent(); + MachineBasicBlock *CurMBB = CurMI->getParent(); + MachineLoop *SpillLoop = MLI->getLoopFor(SpillBlock); + assert(!SpillLoop && "There should not be a spill loop."); + + std::vector Groups; + for (auto *Use : DominatedUses) { + MachineLoop *UseLoop = MLI->getLoopFor(Use->getParent()); + if (UseLoop) { + // If a use is in a loop then the restore instruction is emitted in the + // outermost loop's preheader. 
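// Aside (illustration): placing the reload in the outermost preheader trades
// a load executed once per entry into the loop nest for a load executed on
// every iteration; under the assumed trip weight of 1000, a reload inside a
// doubly nested loop would run on the order of 1000 * 1000 times.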
+ MachineLoop *OutermostLoop = UseLoop->getOutermostLoop(); + MachineBasicBlock *OutermostLoopPreheader = + OutermostLoop->getLoopPreheader(); + Groups.emplace_back(Use, OutermostLoopPreheader); + } else if (Use->isPHI()) { + // In case of phi nodes, the restore instructions are emitted at the + // bottom of the incoming blocks. + for (MachineBasicBlock *PhiOpMBB : + getPhiBlocksOfSpillReg(Use, DefRegToSpill)) { + Groups.emplace_back(Use, PhiOpMBB); + } + } else { + // Emit restore before Use. + Groups.emplace_back(Use, Use->getParent()); + } + } + + // Our goal is to emit as few restores as possible by avoiding emitting + // restore instructions if an earlier restore can be reused. + // + // Create groups of instructions where the group head dominates the rest in + // the group. In addition, we check if we can find an eligible common + // dominator where we can emit the restore instruction. + // + // In the following example, there are two groups. The first group consists of + // the uses in BB3 and BB5 and the second group consists of the uses in BB4 + // and BB6. The head of the first group is the use in BB3 and the head of the + // second group is the use in BB4. + // + // BB1 + // r1 = ... + // | + // BB2 + // spill r1 <-- high register pressure block + // / \ + // BB3 BB4 + // r2 = restore r1 r3 = restore r1 + // ... = r2 ... = r3 + // | | + // BB5 BB6 + // ... = r2 ... = r3 + // + // In the following example, we emit the restore instruction in the common + // dominator of the two uses in BB4 and BB5. + // BB1 + // r1 = ... + // | + // BB2 + // spill r1 <-- high register pressure block + // | + // BB3 + // r2 = restore r1 + // / \ + // BB4 BB5 + // ... = r2 ... = r2 + // + for (unsigned Idx1 = 0, E = Groups.size(); Idx1 != E; ++Idx1) { + auto &G1 = Groups[Idx1]; + if (G1.isDeleted()) + continue; + for (unsigned Idx2 = Idx1 + 1; Idx2 < E; ++Idx2) { + auto &G2 = Groups[Idx2]; + if (G2.isDeleted()) + continue; + + MachineInstr *Head1 = G1.getHead(); + MachineInstr *Head2 = G2.getHead(); + MachineBasicBlock *RestoreBlock1 = G1.getRestoreBlock(); + MachineBasicBlock *RestoreBlock2 = G2.getRestoreBlock(); + SmallVector UseBlocks; + for (auto *Block : G1.getUseBlocks()) + UseBlocks.push_back(Block); + + for (auto *Block : G2.getUseBlocks()) + UseBlocks.push_back(Block); + + MachineBasicBlock *CommonDom = DT->findNearestCommonDominator( + make_range(UseBlocks.begin(), UseBlocks.end())); + + if ((RestoreBlock1 != RestoreBlock2) && + shouldEmitRestoreInCommonDominator(SpillBlock, CurMBB, CommonDom)) { + // Set a common dominator if the two restore blocks are different. + G1.merge(G2); + G1.setCommonDominator(CommonDom); + } else if (DT->dominates(Head1, Head2) && !G1.getCommonDominator() && + !G2.getCommonDominator()) { + // If there is no common dominator and one Head dominates the other, + // then we can merge the two groups. + G1.merge(G2); + } + } + } + + // For each group emit one restore for the group header in the parent block of + // the group header or the common dominator. The rest of the uses in the group + // will reuse the value loaded by the restore of the header. 
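// Aside (schematic result; spill/reload pseudos and names are illustrative
// only): one reload is emitted per group and every other use in the group is
// rewritten to that reload's fresh virtual register, which keeps the function
// in SSA form -- the spilled value retains its single definition:
//
//   %v = ...                             ; only definition of the spilled value
//   SI_SPILL_V32_SAVE %v, %stack.0       ; store at the high-pressure point
//   ...
//   %v1 = SI_SPILL_V32_RESTORE %stack.0  ; reload for the group head
//   ... = %v1                            ; head use
//   ... = %v1                            ; dominated uses reuse %v1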
+ for (auto &G1 : Groups) { + if (G1.isDeleted()) + continue; + MachineInstr *Head = G1.getHead(); + MachineBasicBlock *HeadMBB = G1.getRestoreBlock(); + MachineInstr *Restore = nullptr; + if (G1.hasCommonDominator()) { + MachineBasicBlock *CommonDominator = G1.getCommonDominator(); + MachineInstr *UseInCommonDominator = nullptr; + for (auto *U : G1.getUses()) { + if (U->getParent() == CommonDominator) { + if (UseInCommonDominator) { + if (DT->dominates(U, UseInCommonDominator)) + UseInCommonDominator = U; + } else { + UseInCommonDominator = U; + } + } + } + if (UseInCommonDominator) { + Restore = emitRestore(DefRegToSpill, UseInCommonDominator, FI); + Head = UseInCommonDominator; + HeadMBB = CommonDominator; + } else { + Restore = emitRestore(DefRegToSpill, *CommonDominator, FI); + Head->substituteRegister(DefRegToSpill, Restore->getOperand(0).getReg(), + 0, *TRI); + } + } else if (Head->isPHI()) { + Restore = emitRestore(DefRegToSpill, *HeadMBB, FI); + Head->substituteRegister(DefRegToSpill, Restore->getOperand(0).getReg(), + 0, *TRI); + } else if (MLI->getLoopFor(Head->getParent())) { + Restore = emitRestore(DefRegToSpill, *HeadMBB, FI); + Head->substituteRegister(DefRegToSpill, Restore->getOperand(0).getReg(), + 0, *TRI); + } else { + Restore = emitRestore(DefRegToSpill, Head, FI); + } + RestoreInstrs.push_back(Restore); + RestoreUses.push_back(Head); + + // Update the rest of the uses in the group to reuse the value restored by + // the head of the group. + for (auto *U : G1.getUses()) { + assert(U != SpillInstruction); + if (U == Head) + continue; + + U->substituteRegister(DefRegToSpill, Restore->getOperand(0).getReg(), 0, + *TRI); + RestoreUses.push_back(U); + LLVM_DEBUG(dbgs() << "Updated use: " << *U); + LLVM_DEBUG(dbgs() << "With register = " + << printReg(Restore->getOperand(0).getReg(), TRI) + << "\n"); + } + } +} + +SetVectorType AMDGPUEarlyRegisterSpilling::collectUsesThatNeedRestoreInstrs( + Register DefRegToSpill, MachineInstr *SpillInstruction, + const SetVectorType &UnreachableUses) { + SetVectorType DominatedUses; + MachineBasicBlock *SpillBlock = SpillInstruction->getParent(); + for (MachineInstr &U : MRI->use_nodbg_instructions(DefRegToSpill)) { + if (&U == SpillInstruction) + continue; + + if (UnreachableUses.contains(&U)) + continue; + + if (U.isPHI()) { + for (auto *PhiOpMBB : getPhiBlocksOfSpillReg(&U, DefRegToSpill)) { + if (DT->dominates(SpillBlock, PhiOpMBB)) { + DominatedUses.insert(&U); + } + } + } else if (DT->dominates(SpillInstruction, &U)) { + DominatedUses.insert(&U); + } + } + return DominatedUses; +} + +void AMDGPUEarlyRegisterSpilling::emitRestores( + Register DefRegToSpill, MachineInstr *CurMI, MachineInstr *SpillInstruction, + const SetVectorType &UnreachableUses, const TargetRegisterClass *RC, + int FI) { + assert(MRI->hasOneDef(DefRegToSpill) && + "The Register does not have one definition"); + MachineInstr *InstrOfDefRegToSpill = + MRI->getOneDef(DefRegToSpill)->getParent(); + + // Collect the uses that are dominated by SpillInstruction + SetVectorType DominatedUses = collectUsesThatNeedRestoreInstrs( + DefRegToSpill, SpillInstruction, UnreachableUses); + + SmallVector RestoreInstrs; + SmallVector RestoreUses; + emitRestoreInstrsForDominatedUses(DefRegToSpill, SpillInstruction, CurMI, + DominatedUses, RestoreInstrs, RestoreUses, + FI); + + // Update the live interval analysis. 
+ updateIndexes(InstrOfDefRegToSpill); + updateIndexes(SpillInstruction); + updateLiveness(InstrOfDefRegToSpill); + updateLiveness(SpillInstruction); + + if (InstrOfDefRegToSpill != CurMI) { + updateIndexes(CurMI); + updateLiveness(CurMI); + } + + for (auto *Use : RestoreInstrs) { + updateIndexes(Use); + updateLiveness(Use); + } + + for (auto *Use : RestoreUses) { + updateIndexes(Use); + updateLiveness(Use); + } +} + +// We have to collect the unreachable uses before we emit the spill instruction. +// This is due to the fact that some unreachable uses might become reachable if +// we spill in common dominator. +std::pair +AMDGPUEarlyRegisterSpilling::collectNonDominatedReachableAndUnreachableUses( + MachineBasicBlock *SpillBlock, Register DefRegToSpill, + MachineInstr *CurMI) { + // The reachable uses are the ones that can be reached by the SpillBlock. + SetVectorType NonDominatedReachableUses; + // The non-dominated uses are the uses that cannot be reached by the + // SpillBlock. + SetVectorType UnreachableUses; + for (MachineInstr &U : MRI->use_nodbg_instructions(DefRegToSpill)) { + if (U.isPHI()) { + for (auto *PhiOpMBB : getPhiBlocksOfSpillReg(&U, DefRegToSpill)) { + if (DT->dominates(SpillBlock, PhiOpMBB)) + continue; + if (isReachable(SpillBlock, PhiOpMBB)) { + if (!DT->dominates(SpillBlock, PhiOpMBB)) + NonDominatedReachableUses.insert(&U); + } else if (PhiOpMBB != SpillBlock) + UnreachableUses.insert(&U); + } + } else { + MachineBasicBlock *UMBB = U.getParent(); + if (DT->dominates(CurMI, &U)) + continue; + + if (isReachable(SpillBlock, UMBB)) { + if (!DT->dominates(SpillBlock, UMBB)) + NonDominatedReachableUses.insert(&U); + } else + UnreachableUses.insert(&U); + } + } + return {NonDominatedReachableUses, UnreachableUses}; +} + +// Find the common dominator of the reachable uses and the block that we +// intend to spill(SpillBlock). +MachineBasicBlock *AMDGPUEarlyRegisterSpilling::findCommonDominatorToSpill( + MachineBasicBlock *SpillBlock, Register DefRegToSpill, + const SetVectorType &NonDominatedReachableUses) { + SmallPtrSet Blocks; + for (auto *RU : NonDominatedReachableUses) { + if (RU->isPHI()) { + for (auto *PhiOpMBB : getPhiBlocksOfSpillReg(RU, DefRegToSpill)) + Blocks.insert(PhiOpMBB); + } else + Blocks.insert(RU->getParent()); + } + + Blocks.insert(SpillBlock); + MachineBasicBlock *CommonDominatorToSpill = + DT->findNearestCommonDominator(make_range(Blocks.begin(), Blocks.end())); + + return CommonDominatorToSpill; +} + +std::pair +AMDGPUEarlyRegisterSpilling::getWhereToSpill(MachineInstr *CurMI, + Register DefRegToSpill) { + assert(MRI->hasOneDef(DefRegToSpill) && + "The Register does not have one definition"); + MachineInstr *InstrOfDefRegToSpill = + MRI->getOneDef(DefRegToSpill)->getParent(); + MachineBasicBlock *DefRegMBB = InstrOfDefRegToSpill->getParent(); + MachineBasicBlock *CurMBB = CurMI->getParent(); + MachineLoop *DefInstrLoop = MLI->getLoopFor(DefRegMBB); + MachineLoop *CurLoop = MLI->getLoopFor(CurMI->getParent()); + // We do not spill inside the loop nest because of the spill overhead. So, + // we only need to know about the outermost loop. 
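// Aside (worked example): if the defining instruction sits inside a loop and
// the store were emitted right next to it, the store would execute on every
// iteration -- roughly 1000x more often under the assumed trip weight.
// Emitting it once in the loop nest's exit block (case 1 below) or in the
// preheader (case 2 below) keeps the spill overhead independent of the trip
// count.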
+ if (CurLoop) + CurLoop = CurLoop->getOutermostLoop(); + + MachineBasicBlock *SpillBlock = nullptr; + MachineBasicBlock::iterator WhereToSpill; + // case 1: + // - the register we are about to spill (DefRegToSpill) is defined in loop + // - the high register pressure (CurMI) is outside the loop + // - we emit the spill instruction in one of the exit blocks of the loop + // TODO: improve spilling in loops + if ((DefInstrLoop && !CurLoop) || + (DefInstrLoop && CurLoop && + ((DefInstrLoop != CurLoop) || (!DefInstrLoop->contains(CurLoop) && + !CurLoop->contains(DefInstrLoop))))) { + SmallVector ExitBlocks; + MachineLoop *OutermostLoop = DefInstrLoop->getOutermostLoop(); + OutermostLoop->getUniqueExitBlocks(ExitBlocks); + assert(ExitBlocks.size() == 1 && "There should be only one exit basic " + "block after CFG structurization"); + MachineBasicBlock *ExitBB = ExitBlocks.back(); + if (!DT->dominates(ExitBB, CurMBB)) + return {}; + if (ExitBB == CurMBB) { + WhereToSpill = CurMI->getIterator(); + SpillBlock = ExitBB; + } else { + WhereToSpill = ExitBB->getFirstTerminator(); + if (WhereToSpill == ExitBB->end()) + WhereToSpill = ExitBB->instr_end(); + SpillBlock = ExitBB; + } + } + // case 2: + // - the register we are about to spill is outside the loop + // - the high register pressure instruction (CurMI) is inside the loop + // - we emit the spill instruction in the loop preheader + else if (!DefInstrLoop && CurLoop) { + MachineBasicBlock *CurLoopPreheader = CurLoop->getLoopPreheader(); + assert(CurLoopPreheader && "There is not loop preheader"); + WhereToSpill = CurLoopPreheader->getFirstTerminator(); + if (WhereToSpill == CurLoopPreheader->end()) + WhereToSpill = CurLoopPreheader->back(); + SpillBlock = CurLoopPreheader; + } + // case 3: + // - the high register pressure instruction is a PHI node + // - we emit the spill instruction before the first non-PHI instruction + else if (CurMI->isPHI()) { + WhereToSpill = CurMBB->getFirstNonPHI(); + SpillBlock = CurMBB; + } + // case 4: + // - the high register pressure instruction is also the instruction that + // defines the register we are about to spill + // - we emit the spill instruction after the high reg pressure instr + else if (CurMI == InstrOfDefRegToSpill) { + WhereToSpill = std::next(CurMI->getIterator()); + SpillBlock = CurMBB; + } + // case 5: + // - this is the general case. We spill just before the instruction where + // we detect high register pressure. + else { + WhereToSpill = CurMI->getIterator(); + SpillBlock = CurMBB; + } + return {SpillBlock, WhereToSpill}; +} + +void AMDGPUEarlyRegisterSpilling::spill(MachineInstr *CurMI, + GCNDownwardRPTracker &RPTracker, + unsigned NumOfSpills) { + // CurMI indicates the point of the code where there is high register + // pressure. 
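// Aside (overview of the steps below): for each register returned by
// getRegistersToSpill() the code
//   1. creates a fresh spill stack slot,
//   2. picks the spill placement with getWhereToSpill(),
//   3. hoists that placement to a common dominator when some uses are
//      reachable from, but not dominated by, the chosen block,
//   4. emits the store and registers it with LiveIntervals, and
//   5. emits the reloads and repairs SSA via emitRestores(),
// stopping once NumOfSpills registers have been spilled.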
+ MachineBasicBlock *CurMBB = CurMI->getParent(); + unsigned SpillCnt = 0; + for (Register DefRegToSpill : getRegistersToSpill(CurMI, RPTracker)) { + if (SpillCnt >= NumOfSpills) + break; + + assert(MRI->hasOneDef(DefRegToSpill) && + "The Register does not have one definition"); + MachineInstr *InstrOfDefRegToSpill = + MRI->getOneDef(DefRegToSpill)->getParent(); + MachineBasicBlock *DefRegMBB = InstrOfDefRegToSpill->getParent(); + // Sanity check that the spilled register is defined before the high + // register pressure point + assert((!DT->dominates(CurMI, InstrOfDefRegToSpill) || + !isReachable(CurMBB, DefRegMBB)) && + "This register should not be spilled"); + const TargetRegisterClass *RC = TRI->getRegClassForReg(*MRI, DefRegToSpill); + unsigned Size = TRI->getSpillSize(*RC); + Align Alignment = TRI->getSpillAlign(*RC); + int FI = FrameInfo->CreateSpillStackObject(Size, Alignment); + + // First we find where we should emit the spill instruction. + MachineInstr *SpillInstruction = nullptr; + MachineBasicBlock *SpillBlock = nullptr; + MachineBasicBlock::iterator WhereToSpill; + std::tie(SpillBlock, WhereToSpill) = getWhereToSpill(CurMI, DefRegToSpill); + if (SpillBlock == nullptr) + continue; + // The next step is to check if there are any uses which are reachable + // from the SpillBlock. In this case, we have to emit the spill in the + // common dominator of the SpillBlock and the blocks of the reachable + // uses. + SetVectorType NonDominatedReachableUses; + SetVectorType UnreachableUses; + std::tie(NonDominatedReachableUses, UnreachableUses) = + collectNonDominatedReachableAndUnreachableUses(SpillBlock, + DefRegToSpill, CurMI); + MachineBasicBlock *CommonDominatorToSpill = nullptr; + if (!NonDominatedReachableUses.empty()) + CommonDominatorToSpill = findCommonDominatorToSpill( + SpillBlock, DefRegToSpill, NonDominatedReachableUses); + if (CommonDominatorToSpill && CommonDominatorToSpill != SpillBlock) { + SpillBlock = CommonDominatorToSpill; + WhereToSpill = SpillBlock->getFirstTerminator(); + if (WhereToSpill == SpillBlock->end()) + WhereToSpill = SpillBlock->instr_end(); + } + // Emit the spill instruction. + TII->storeRegToStackSlot(*SpillBlock, WhereToSpill, DefRegToSpill, + true, /* kill */ + FI, RC, 0); + SpillInstruction = &*(std::prev(WhereToSpill)); + // Maintain live intervals. + LIS->InsertMachineInstrInMaps(*SpillInstruction); + + SpilledRegs.insert(DefRegToSpill); + NumOfERSSpills++; + SpillCnt++; + + assert(SpillInstruction && "There is not a spill instruction"); + LLVM_DEBUG(dbgs() << "High register pressure point = " << *CurMI); + LLVM_DEBUG(dbgs() << "Register to spill = " << printReg(DefRegToSpill, TRI) + << "\n"); + LLVM_DEBUG(dbgs() << "Spill instruction = " << *SpillInstruction); + LLVM_DEBUG(dbgs() << "Spill block = " + << SpillInstruction->getParent()->getName() << "\n"); + + // Find the restore locations, emit the restore instructions and maintain + // SSA when needed. + emitRestores(DefRegToSpill, CurMI, SpillInstruction, UnreachableUses, RC, + FI); + } + // Reset the tracker because it has already read the next instruction which + // we might have modified by emitting a spill or restore instruction. 
+ RPTracker.reset(*CurMI); + RPTracker.advance(); +} + +GCNRegPressure +AMDGPUEarlyRegisterSpilling::getMaxPressure(const MachineFunction &MF) { + GCNRegPressure MaxPressure; + GCNUpwardRPTracker RPTracker(*LIS); + for (auto &MBB : MF) { + GCNRegPressure BBMaxPressure; + + if (!MBB.empty()) { + RPTracker.reset(MBB.instr_back()); + for (auto &MI : reverse(MBB)) + RPTracker.recede(MI); + + BBMaxPressure = RPTracker.getMaxPressureAndReset(); + } + MaxPressure = max(BBMaxPressure, MaxPressure); + } + return MaxPressure; +} + +bool AMDGPUEarlyRegisterSpilling::runOnMachineFunction(MachineFunction &MF) { + + if (skipFunction(MF.getFunction())) + return false; + + if (!EarlyRegisterSpilling) + return false; + + const GCNSubtarget &ST = MF.getSubtarget(); + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + MLI = &getAnalysis().getLI(); + MRI = &MF.getRegInfo(); + MFI = MF.getInfo(); + FrameInfo = &MF.getFrameInfo(); + LIS = &getAnalysis().getLIS(); + Indexes = &getAnalysis().getSI(); + DT = &getAnalysis().getDomTree(); + NUA.run(MF, MLI); + + unsigned VgprNum = getMaxPressure(MF).getVGPRNum(ST.hasGFX90AInsts()); + unsigned Occupancy = + ST.getOccupancyWithNumVGPRs(VgprNum, ST.getDynamicVGPRBlockSize()); + unsigned MaxVGPRs = + VGPRMaxNums > 0 + ? VGPRMaxNums + : std::min(ST.getMaxNumVGPRs(Occupancy, ST.getDynamicVGPRBlockSize()), + ST.getMaxNumVGPRs(MF)); + + LLVM_DEBUG(dbgs() << "===========================================\n"); + LLVM_DEBUG(dbgs() << "Early Register Spilling\n"); + LLVM_DEBUG(dbgs() << "===========================================\n"); + LLVM_DEBUG(dbgs() << MF.getName() << "\n"); + LLVM_DEBUG(dbgs() << "MaxVGPRs = " << MaxVGPRs << "\n"); + + GCNDownwardRPTracker RPTracker(*LIS); + ReversePostOrderTraversal RPOT(&MF); + + for (MachineBasicBlock *MBB : RPOT) { + if (MBB->empty()) + continue; + + // Initialize the Register Pressure Tracker at the beginning of the + // block. + RPTracker.reset(*MBB->begin()); + RPTracker.advance(); + + // Iterate over the instructions of MBB and check if the live registers + // are more than the available registers. + for (auto It = MBB->begin(), ItE = MBB->end(); It != ItE; ++It) { + MachineInstr *MI = &*It; + + if (MI->isDebugInstr()) + continue; + + if (!TII->isVGPRSpill(MI->getOpcode()) && !MI->isBranch()) { + + const MachineInstr *LastTrackedMI = RPTracker.getLastTrackedMI(); + assert(MI == LastTrackedMI && "The tracker and the loop iteration " + "should visit the same instruction."); + unsigned VGPRLiveRegs = RPTracker.getPressure().getVGPRNum(false); + // Spill if the live VGPR registers are more than the available + // VGPRs. + if (VGPRLiveRegs > MaxVGPRs) { + unsigned NumOfSpills = VGPRLiveRegs - MaxVGPRs; + spill(MI, RPTracker, NumOfSpills); + } + } + + // Move the tracker to the next instruction. + // If we have reached the bottom of a basic block, then we have to + // initialize the tracker at the beginning of the next basic block. + if (MI == &MBB->back()) + continue; + + // Phi nodes might include registers that are defined later in the + // code. Hence, we have to initialize the tracker again. 
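// Aside (illustration): e.g. a loop-header PHI
//   %phi:vgpr_32 = PHI %init, %bb.0, %backedge_val, %bb.3
// reads %backedge_val, whose definition in %bb.3 has not been visited yet by
// the downward walk, so the tracked live set can become inconsistent after
// stepping over the PHI; re-initializing at the next instruction re-derives
// the live set from LiveIntervals instead.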
+ if (MI->getOpcode() == AMDGPU::PHI) { + RPTracker.reset(*MI->getNextNode()); + } + RPTracker.advance(); + } + } + + clearTables(); + return true; +} + +char AMDGPUEarlyRegisterSpilling::ID = 0; + +INITIALIZE_PASS_BEGIN(AMDGPUEarlyRegisterSpilling, DEBUG_TYPE, + "Early Register Spilling", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) +INITIALIZE_PASS_END(AMDGPUEarlyRegisterSpilling, DEBUG_TYPE, + "Early Register Spilling", false, false) + +char &llvm::AMDGPUEarlyRegisterSpillingID = AMDGPUEarlyRegisterSpilling::ID; + +FunctionPass *llvm::createAMDGPUEarlyRegisterSpillingPass() { + return new AMDGPUEarlyRegisterSpilling(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUEarlyRegisterSpilling.h b/llvm/lib/Target/AMDGPU/AMDGPUEarlyRegisterSpilling.h new file mode 100644 index 0000000000000..2dcef2a4cfe0d --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUEarlyRegisterSpilling.h @@ -0,0 +1,166 @@ +//===------------------- AMDGPUEarlyRegisterSpilling.h --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements Early Register Spilling. +// +// This is based on ideas from the paper: +// "Register Spilling and Live-Range Splitting for SSA-Form Programs" +// Matthias Braun and Sebastian Hack, CC'09 +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUEARLYREGISTERSPILLING_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUEARLYREGISTERSPILLING_H + +#include "AMDGPUNextUseAnalysis.h" +#include "GCNRegPressure.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" + +using namespace llvm; + +using SetVectorType = SmallSetVector; + +class AMDGPUEarlyRegisterSpilling : public MachineFunctionPass { + const SIRegisterInfo *TRI = nullptr; + const SIInstrInfo *TII = nullptr; + const MachineLoopInfo *MLI = nullptr; + MachineRegisterInfo *MRI = nullptr; + const SIMachineFunctionInfo *MFI = nullptr; + MachineFrameInfo *FrameInfo = nullptr; + LiveIntervals *LIS = nullptr; + SlotIndexes *Indexes = nullptr; + MachineDominatorTree *DT = nullptr; + + AMDGPUNextUseAnalysis NUA; + /// Keep the registers that are spilled. + DenseSet SpilledRegs; + /// Keep the output registers of the restored instructions. + DenseSet RestoredRegs; + /// Similar to next-use distance analysis, we assume an approximate trip count + /// of 1000 for all loops. + static constexpr const double LoopWeight = 1000.0; + + /// Check if it is legal to spill \p CandidateReg e.g. is not a physical + /// register. + bool isLegalToSpill(Register CandidateReg); + + /// Return the registers with the longest next-use distance that we need to + /// spill. + SmallVector getRegistersToSpill(MachineInstr *CurMI, + GCNDownwardRPTracker &RPTracker); + + /// Return where we have to spill the DefRegToSpill. 
+ std::pair + getWhereToSpill(MachineInstr *CurMI, Register DefRegToSpill); + + /// Return the uses that need a restore instruction. + SetVectorType + collectUsesThatNeedRestoreInstrs(Register DefRegToSpill, + MachineInstr *SpillInstruction, + const SetVectorType &UnreachableUses); + + /// Find the restore locations, emit the restore instructions and maintain + /// SSA when needed. + void emitRestores(Register DefRegToSpill, MachineInstr *CurMI, + MachineInstr *SpillInstruction, + const SetVectorType &UnreachableUses, + const TargetRegisterClass *RC, int FI); + + /// Main spill function that emits the spill and restore code. + void spill(MachineInstr *CurMI, GCNDownwardRPTracker &RPTracker, + unsigned NumOfSpills); + + /// Emit restore instruction where it is needed + MachineInstr *emitRestore(Register SpillReg, MachineInstr *UseMI, int FI); + /// Emit restore instruction at the end of a basic block. + MachineInstr *emitRestore(Register SpillReg, MachineBasicBlock &InsertBB, + int FI); + + /// Emit restore instructions for each group that contains the uses that are + /// dominated by the head of the group. + void emitRestoreInstrsForDominatedUses( + Register DefRegToSpill, MachineInstr *SpillInstruction, + MachineInstr *CurMI, SetVectorType &DominatedUses, + SmallVector &RestoreInstrs, + SmallVector &RestoreUses, int FI); + + /// Check if it is legal or profitable to emit a restore in the common + /// dominator. + bool shouldEmitRestoreInCommonDominator( + MachineBasicBlock *SpillBlock, MachineBasicBlock *CurMBB, + MachineBasicBlock *CommonDominatorToRestore); + + /// Find the common dominator of the reachable uses and the block that we + /// intend to spill. + MachineBasicBlock * + findCommonDominatorToSpill(MachineBasicBlock *SpillBlock, + Register DefRegToSpill, + const SetVectorType &NonDominatedReachableUses); + + /// Collect Non Dominated Reachable and Unreachable uses. + std::pair + collectNonDominatedReachableAndUnreachableUses(MachineBasicBlock *SpillBlock, + Register DefRegToSpill, + MachineInstr *CurMI); + + /// Helper functions to update the live interval analysis which is used by + /// the Register Pressure Tracker. + void updateIndexes(MachineInstr *MI); + void updateLiveness(Register Reg); + void updateLiveness(MachineInstr *MI); + + bool hasPHIUseInSameBB(Register Reg, MachineBasicBlock *MBB); + + /// Calculate the initial maximum register pressure per basic block (before + /// any spilling) because it gives us the maximum number of VGPRs and SGPRs. 
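// Aside (worked example, hypothetical numbers): the budget used by
// runOnMachineFunction() is derived from this initial peak -- e.g. a peak of
// 70 VGPRs maps to an occupancy level whose VGPR limit is, say, 84, so no
// spilling happens; passing -max-vgprs=64 instead forces the budget to 64 and
// every point whose VGPR pressure exceeds 64 triggers spills for the excess.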
+ GCNRegPressure getMaxPressure(const MachineFunction &MF); + + bool isSpilledReg(Register Reg) { return SpilledRegs.contains(Reg); } + + bool isRestoredReg(Register Reg) { return RestoredRegs.contains(Reg); } + + void clearTables() { + SpilledRegs.clear(); + RestoredRegs.clear(); + } + + bool isReachable(MachineBasicBlock *From, MachineBasicBlock *To) { + return NUA.getShortestDistance(From, To) != + std::numeric_limits::max(); + } + +public: + static char ID; + + AMDGPUEarlyRegisterSpilling() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &) override; + + StringRef getPassName() const override { + return "AMDGPU Early Register Spilling"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUEARLYREGISTERSPILLING_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.cpp new file mode 100644 index 0000000000000..a413b097b961a --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.cpp @@ -0,0 +1,588 @@ +//===---------------------- AMDGPUNextUseAnalysis.cpp --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUNextUseAnalysis.h" +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "llvm/InitializePasses.h" +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-next-use-analysis" + +static cl::opt DumpNextUseDistance("dump-next-use-distance", + cl::init(false), cl::Hidden); + +bool AMDGPUNextUseAnalysis::isBackedge(MachineBasicBlock *From, + MachineBasicBlock *To) const { + if (!From->isSuccessor(To)) + return false; + MachineLoop *Loop1 = MLI->getLoopFor(From); + MachineLoop *Loop2 = MLI->getLoopFor(To); + if (!Loop1 || !Loop2 || Loop1 != Loop2) + return false; + MachineBasicBlock *LoopHeader = Loop1->getHeader(); + if (To != LoopHeader) + return false; + SmallVector Latches; + Loop1->getLoopLatches(Latches); + auto It = llvm::find(Latches, From); + return It != Latches.end(); +} + +// Calculate the shortest distance between two blocks using Dijkstra algorithm. 
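// Aside (simplified sketch in plain C++, not the pass's actual data
// structures): the search below is ordinary Dijkstra where entering a block
// costs its (loop-weighted) instruction count and back edges are ignored.
// Stripped to its core, and assuming From != To:
//
//   #include <limits>
//   #include <queue>
//   #include <utility>
//   #include <vector>
//
//   // Adj[N] lists N's successors with back edges already removed; Weight[N]
//   // is the block's weighted size.  Returns the summed weight of the blocks
//   // strictly between From and To (their interiors are added by the
//   // caller), or +inf if To is unreachable.
//   double shortestBlockPath(const std::vector<std::vector<int>> &Adj,
//                            const std::vector<double> &Weight, int From,
//                            int To) {
//     const double Inf = std::numeric_limits<double>::max();
//     std::vector<double> Dist(Adj.size(), Inf);
//     using Item = std::pair<double, int>; // (distance, block)
//     std::priority_queue<Item, std::vector<Item>, std::greater<Item>> Q;
//     Dist[From] = 0.0;
//     Q.push({0.0, From});
//     while (!Q.empty()) {
//       auto [D, N] = Q.top();
//       Q.pop();
//       if (D > Dist[N])
//         continue;                       // stale queue entry
//       if (N == To)
//         return D - Weight[To];          // exclude To's own size
//       for (int Succ : Adj[N]) {
//         double ND = D + Weight[Succ];   // entering Succ costs its size
//         if (ND < Dist[Succ]) {
//           Dist[Succ] = ND;
//           Q.push({ND, Succ});
//         }
//       }
//     }
//     return Inf;
//   }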
+double AMDGPUNextUseAnalysis::getShortestPath(MachineBasicBlock *FromMBB, + MachineBasicBlock *ToMBB) { + assert(FromMBB != ToMBB && "The basic blocks should be different.\n"); + DenseSet Visited; + struct Data { + MachineBasicBlock *BestPred = nullptr; + double ShortestDistance = std::numeric_limits::max(); + }; + DenseMap MBBData; + + auto Cmp = [&MBBData](MachineBasicBlock *MBB1, MachineBasicBlock *MBB2) { + return MBBData[MBB1].ShortestDistance > MBBData[MBB2].ShortestDistance; + }; + std::priority_queue, + decltype(Cmp)> + Worklist(Cmp); + + Worklist.push(FromMBB); + MBBData[FromMBB] = {nullptr, 0.0}; + + while (!Worklist.empty()) { + MachineBasicBlock *CurMBB = Worklist.top(); + Worklist.pop(); + + if (!Visited.insert(CurMBB).second) + continue; + + if (CurMBB == ToMBB) { + auto *Pred = MBBData[CurMBB].BestPred; + return MBBData[Pred].ShortestDistance - MBBData[FromMBB].ShortestDistance; + } + + auto Pair = MBBData.try_emplace( + CurMBB, Data{nullptr, std::numeric_limits::max()}); + double CurrMBBDist = Pair.first->second.ShortestDistance; + + for (MachineBasicBlock *Succ : CurMBB->successors()) { + if (isBackedge(CurMBB, Succ)) + continue; + + auto GetEffectiveLoopDepth = [&](MachineBasicBlock *BB) -> double { + MachineLoop *LoopBB = MLI->getLoopFor(BB); + double LoopDepth = 0.0; + for (MachineLoop *TmpLoop = LoopBB, + *End = LoopBB->getOutermostLoop()->getParentLoop(); + TmpLoop != End; TmpLoop = TmpLoop->getParentLoop()) { + if (TmpLoop->contains(ToMBB)) + continue; + LoopDepth++; + } + return LoopDepth; + }; + + auto GetLoopWeight = [&](MachineBasicBlock *BB) -> double { + MachineLoop *LoopBB = MLI->getLoopFor(BB); + MachineLoop *LoopTo = MLI->getLoopFor(ToMBB); + if (!LoopBB && !LoopTo) + return 0.0; + + if (LoopBB && LoopTo && + (LoopTo->contains(LoopBB) && (LoopTo != LoopBB))) + return std::pow(LoopWeight, + static_cast(MLI->getLoopDepth(BB) - + MLI->getLoopDepth(ToMBB))); + + if ((LoopBB && LoopTo && LoopBB->contains(LoopTo))) + return 1.0; + + if ((!LoopTo && LoopBB) || + (LoopBB && LoopTo && !LoopTo->contains(LoopBB))) + return std::pow(LoopWeight, GetEffectiveLoopDepth(BB)); + + return 0.0; + }; + + auto GetWeightedSize = [&](MachineBasicBlock *BB) -> double { + double LoopWeight = GetLoopWeight(BB); + if (LoopWeight != 0.0) + return static_cast(BB->size()) * LoopWeight; + return static_cast(BB->size()); + }; + double NewSuccDist = CurrMBBDist + GetWeightedSize(Succ); + + auto &[SuccPred, SuccDist] = MBBData[Succ]; + if (NewSuccDist < SuccDist) { + // We found a better path to Succ, update best predecessor and distance + SuccPred = CurMBB; + SuccDist = NewSuccDist; + } + + Worklist.push(Succ); + } + } + return std::numeric_limits::max(); +} + +void AMDGPUNextUseAnalysis::calculateShortestPaths(MachineFunction &MF) { + for (MachineBasicBlock &MBB1 : MF) { + for (MachineBasicBlock &MBB2 : MF) { + if (&MBB1 == &MBB2) + continue; + ShortestPathTable[std::make_pair(&MBB1, &MBB2)] = + getShortestPath(&MBB1, &MBB2); + } + } +} + +double AMDGPUNextUseAnalysis::calculateShortestDistance(MachineInstr *CurMI, + MachineInstr *UseMI) { + MachineBasicBlock *CurMBB = CurMI->getParent(); + MachineBasicBlock *UseMBB = UseMI->getParent(); + + if (CurMBB == UseMBB) + return getInstrId(UseMI) - getInstrId(CurMI); + + double CurMIDistanceToBBEnd = + getInstrId(&*(std::prev(CurMBB->instr_end()))) - getInstrId(CurMI); + double UseDistanceFromBBBegin = getInstrId(&*(UseMI->getIterator())) - + getInstrId(&*(UseMBB->instr_begin())) + 1; + double Dst = getShortestDistanceFromTable(CurMBB, UseMBB); 
+ assert(Dst != std::numeric_limits::max()); + return CurMIDistanceToBBEnd + Dst + UseDistanceFromBBBegin; +} + +std::pair +AMDGPUNextUseAnalysis::getShortestDistanceToExitingLatch( + MachineBasicBlock *CurMBB, MachineLoop *CurLoop) const { + SmallVector Latches; + CurLoop->getLoopLatches(Latches); + double ShortestDistanceToLatch = std::numeric_limits::max(); + MachineBasicBlock *ExitingLatch = nullptr; + + for (MachineBasicBlock *LMBB : Latches) { + if (LMBB == CurMBB) + return std::make_pair(0.0, CurMBB); + + double Dst = getShortestDistanceFromTable(CurMBB, LMBB); + if (ShortestDistanceToLatch > Dst) { + ShortestDistanceToLatch = Dst; + ExitingLatch = LMBB; + } + } + return std::make_pair(ShortestDistanceToLatch, ExitingLatch); +} + +std::pair +AMDGPUNextUseAnalysis::getLoopDistanceAndExitingLatch( + MachineBasicBlock *CurMBB) const { + MachineLoop *CurLoop = MLI->getLoopFor(CurMBB); + MachineBasicBlock *LoopHeader = CurLoop->getHeader(); + SmallVector Latches; + CurLoop->getLoopLatches(Latches); + bool IsCurLoopLatch = llvm::any_of( + Latches, [&](MachineBasicBlock *LMBB) { return CurMBB == LMBB; }); + MachineBasicBlock *ExitingLatch = nullptr; + double DistanceToLatch = 0.0; + double TotalDistance = 0.0; + + if (CurLoop->getNumBlocks() == 1) + return std::make_pair(static_cast(CurMBB->size()), CurMBB); + + if (CurMBB == LoopHeader) { + std::tie(DistanceToLatch, ExitingLatch) = + getShortestDistanceToExitingLatch(CurMBB, CurLoop); + TotalDistance = static_cast(LoopHeader->size()) + DistanceToLatch + + static_cast(ExitingLatch->size()); + return std::make_pair(TotalDistance, ExitingLatch); + } + + if (IsCurLoopLatch) { + TotalDistance = static_cast(LoopHeader->size()) + + getShortestDistanceFromTable(LoopHeader, CurMBB) + + static_cast(CurMBB->size()); + return std::make_pair(TotalDistance, CurMBB); + } + + double LoopHeaderToCurMBBDistance = + getShortestDistanceFromTable(LoopHeader, CurMBB); + + std::tie(DistanceToLatch, ExitingLatch) = + getShortestDistanceToExitingLatch(CurMBB, CurLoop); + + TotalDistance = static_cast(LoopHeader->size()) + + LoopHeaderToCurMBBDistance + + static_cast(CurMBB->size()) + DistanceToLatch + + static_cast(ExitingLatch->size()); + return std::make_pair(TotalDistance, ExitingLatch); +} + +// Calculates the overhead of a loop nest for three cases: 1. the use is outside +// of the current loop, but they share the same loop nest 2. the use is +// outside of the current loop nest and 3. the use is in a parent loop of the +// current loop nest. +std::pair +AMDGPUNextUseAnalysis::getNestedLoopDistanceAndExitingLatch( + MachineBasicBlock *CurMBB, MachineBasicBlock *UseMBB, + bool IsUseOutsideOfTheCurrentLoopNest, bool IsUseInParentLoop) { + MachineLoop *CurLoop = MLI->getLoopFor(CurMBB); + MachineLoop *UseLoop = MLI->getLoopFor(UseMBB); + + auto GetLoopDistance = + [&](MachineLoop *ML) -> std::pair { + double ShortestDistance = 0.0; + double LoopDistance = 0.0; + MachineBasicBlock *ExitingLatch = nullptr; + double UseLoopDepth = !IsUseOutsideOfTheCurrentLoopNest + ? 
static_cast(MLI->getLoopDepth(UseMBB)) + : 0.0; + if (ML->getNumBlocks() == 1) { + ShortestDistance = + static_cast(ML->getHeader()->size()) * + std::pow(LoopWeight, + (static_cast(MLI->getLoopDepth(ML->getHeader())) - + UseLoopDepth)); + return std::make_pair(ShortestDistance, ML->getLoopLatch()); + } + std::tie(LoopDistance, ExitingLatch) = + getLoopDistanceAndExitingLatch(ML->getHeader()); + ShortestDistance = LoopDistance * LoopWeight; + return std::make_pair(ShortestDistance, ExitingLatch); + }; + + if (IsUseOutsideOfTheCurrentLoopNest) { + MachineLoop *OutermostLoop = CurLoop->getOutermostLoop(); + if (OutermostLoop->contains(UseLoop)) { + // The CurLoop and the UseLoop are independent and they are in the same + // loop nest. + if (MLI->getLoopDepth(CurMBB) <= MLI->getLoopDepth(UseMBB)) { + return GetLoopDistance(CurLoop); + } else { + assert(CurLoop != OutermostLoop && "The loop cannot be the outermost."); + MachineLoop *OuterLoopOfCurLoop = CurLoop; + while (OutermostLoop != OuterLoopOfCurLoop && + MLI->getLoopDepth(OuterLoopOfCurLoop->getHeader()) != + MLI->getLoopDepth(UseMBB)) { + OuterLoopOfCurLoop = OuterLoopOfCurLoop->getParentLoop(); + } + return GetLoopDistance(OuterLoopOfCurLoop); + } + } else { + // We should take into consideration the whole loop nest in the + // calculation of the distance because we will reach the use after + // executing the whole loop nest. + return GetLoopDistance(OutermostLoop); + } + } else if (IsUseInParentLoop) { + MachineLoop *UseLoopSubLoop = nullptr; + for (MachineLoop *ML : UseLoop->getSubLoopsVector()) { + // All the sub-loops of the UseLoop will be executed before the use. + // Hence, we should take this into consideration in distance calculation. + if (ML->contains(CurLoop)) { + UseLoopSubLoop = ML; + break; + } + } + return GetLoopDistance(UseLoopSubLoop); + } + llvm_unreachable("Failed to calculate the loop distance!"); +} + +double AMDGPUNextUseAnalysis::calculateCurLoopDistance(Register DefReg, + MachineInstr *CurMI, + MachineInstr *UseMI) { + MachineBasicBlock *CurMBB = CurMI->getParent(); + MachineBasicBlock *UseMBB = UseMI->getParent(); + MachineLoop *CurLoop = MLI->getLoopFor(CurMBB); + MachineLoop *UseLoop = MLI->getLoopFor(UseMBB); + double LoopDistance = 0.0; + MachineBasicBlock *ExitingLatch = nullptr; + bool IsUseInParentLoop = CurLoop && UseLoop && + (UseLoop->contains(CurLoop) && (UseLoop != CurLoop)); + + bool IsUseOutsideOfTheCurrentLoopNest = + (!UseLoop && CurLoop) || + (CurLoop && UseLoop && !UseLoop->contains(CurLoop) && + !CurLoop->contains(UseLoop)); + + if (IsUseOutsideOfTheCurrentLoopNest) { + if (CurLoop->getSubLoops().empty() && CurLoop->isOutermost()) { + std::tie(LoopDistance, ExitingLatch) = + getLoopDistanceAndExitingLatch(CurMBB); + LoopDistance = LoopDistance * LoopWeight; + } else { + std::tie(LoopDistance, ExitingLatch) = + getNestedLoopDistanceAndExitingLatch(CurMBB, UseMBB, true, false); + } + } else if (IsUseInParentLoop) { + assert(MLI->getLoopDepth(UseMBB) < MLI->getLoopDepth(CurMBB) && + "The loop depth of the current instruction must be bigger than " + "these.\n"); + if (isIncomingValFromBackedge(CurMI, UseMI, DefReg)) + return calculateBackedgeDistance(CurMI, UseMI); + + // Get the loop distance of all the inner loops of UseLoop. 
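// Aside (worked example): with LoopWeight = 1000, a 20-instruction block that
// sits two loop levels deeper than the use contributes 20 * 1000^2 = 2e7 to
// the estimated distance, modelling that the whole inner loop nest is assumed
// to finish before the use is reached.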
+ std::tie(LoopDistance, ExitingLatch) = + getNestedLoopDistanceAndExitingLatch(CurMBB, UseMBB, false, true); + } + + double UseDistanceFromBBBegin = getInstrId(&*(UseMI->getIterator())) - + getInstrId(&*(UseMBB->instr_begin())) + 1; + return LoopDistance + getShortestDistanceFromTable(ExitingLatch, UseMBB) + + UseDistanceFromBBBegin; +} + +double AMDGPUNextUseAnalysis::calculateBackedgeDistance(MachineInstr *CurMI, + MachineInstr *UseMI) { + MachineBasicBlock *CurMBB = CurMI->getParent(); + MachineBasicBlock *UseMBB = UseMI->getParent(); + MachineLoop *CurLoop = MLI->getLoopFor(CurMBB); + MachineLoop *UseLoop = MLI->getLoopFor(UseMBB); + assert(UseLoop && "There is no backedge.\n"); + double CurMIDistanceToBBEnd = + getInstrId(&*(std::prev(CurMBB->instr_end()))) - getInstrId(CurMI); + double UseDistanceFromBBBegin = getInstrId(&*(UseMI->getIterator())) - + getInstrId(&*(UseMBB->instr_begin())) + 1; + + if (!CurLoop) + return CurMIDistanceToBBEnd + getShortestDistanceFromTable(CurMBB, UseMBB) + + UseDistanceFromBBBegin; + + if (CurLoop == UseLoop) { + auto [DistanceToLatch, ExitingLatch] = + getShortestDistanceToExitingLatch(CurMBB, CurLoop); + if (ExitingLatch == CurMBB) + return CurMIDistanceToBBEnd + UseDistanceFromBBBegin; + return UseDistanceFromBBBegin + CurMIDistanceToBBEnd + DistanceToLatch + + static_cast(ExitingLatch->size()); + } + + if (!CurLoop->contains(UseLoop) && !UseLoop->contains(CurLoop)) { + auto [LoopDistance, ExitingLatch] = getLoopDistanceAndExitingLatch(CurMBB); + return LoopDistance + getShortestDistanceFromTable(ExitingLatch, UseMBB) + + UseDistanceFromBBBegin; + } + + if (!CurLoop->contains(UseLoop)) { + auto [InnerLoopDistance, InnerLoopExitingLatch] = + getNestedLoopDistanceAndExitingLatch(CurMBB, UseMBB, false, true); + auto [DistanceToLatch, ExitingLatch] = + getShortestDistanceToExitingLatch(InnerLoopExitingLatch, UseLoop); + return InnerLoopDistance + DistanceToLatch + + static_cast(ExitingLatch->size()) + UseDistanceFromBBBegin; + } + + llvm_unreachable("The backedge distance has not been calculated!"); +} + +bool AMDGPUNextUseAnalysis::isIncomingValFromBackedge(MachineInstr *CurMI, + MachineInstr *UseMI, + Register DefReg) const { + if (!UseMI->isPHI()) + return false; + + MachineLoop *CurLoop = MLI->getLoopFor(CurMI->getParent()); + MachineLoop *UseLoop = MLI->getLoopFor(UseMI->getParent()); + + if (!UseLoop) + return false; + + if (CurLoop && !UseLoop->contains(CurLoop)) + return false; + + if (UseMI->getParent() != UseLoop->getHeader()) + return false; + + SmallVector Latches; + UseLoop->getLoopLatches(Latches); + + bool IsNotIncomingValFromLatch = false; + bool IsIncomingValFromLatch = false; + auto Ops = UseMI->operands(); + for (auto It = std::next(Ops.begin()), ItE = Ops.end(); It != ItE; + It = std::next(It, 2)) { + auto &RegMO = *It; + auto &MBBMO = *std::next(It); + assert(RegMO.isReg() && "Expected register operand of PHI"); + assert(MBBMO.isMBB() && "Expected MBB operand of PHI"); + if (RegMO.getReg() == DefReg) { + MachineBasicBlock *IncomingBB = MBBMO.getMBB(); + auto It = llvm::find(Latches, IncomingBB); + if (It == Latches.end()) + IsNotIncomingValFromLatch = true; + else + IsIncomingValFromLatch = true; + } + } + return IsIncomingValFromLatch && !IsNotIncomingValFromLatch; +} + +void AMDGPUNextUseAnalysis::dumpShortestPaths() const { + for (const auto &P : ShortestPathTable) { + MachineBasicBlock *From = P.first.first; + MachineBasicBlock *To = P.first.second; + double Dist = P.second; + errs() << "From: " << From->getName() << "-> To:" 
<< To->getName() << " = " + << Dist << "\n"; + } +} + +void AMDGPUNextUseAnalysis::printAllDistances(MachineFunction &MF) { + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : *&MBB) { + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + + Register Reg = MO.getReg(); + if (!MO.isReg()) + continue; + + if (MO.isUse()) + continue; + + if (Reg.isPhysical() || TRI->isAGPR(*MRI, Reg)) + continue; + + std::optional NextUseDistance = getNextUseDistance(Reg); + errs() << "Next-use distance of Register " << printReg(Reg, TRI) + << " = "; + if (NextUseDistance) + errs() << format("%.1f", *NextUseDistance); + else + errs() << "null"; + errs() << "\n"; + } + } + } +} + + +// TODO: Remove it. It is only used for testing. +std::optional +AMDGPUNextUseAnalysis::getNextUseDistance(Register DefReg) { + assert(!DefReg.isPhysical() && !TRI->isAGPR(*MRI, DefReg) && + "Next-use distance is calculated for SGPRs and VGPRs"); + double NextUseDistance = std::numeric_limits::max(); + double CurrentNextUseDistance = std::numeric_limits::max(); + MachineInstr *CurMI = &*MRI->def_instr_begin(DefReg); + MachineBasicBlock *CurMBB = CurMI->getParent(); + MachineLoop *CurLoop = MLI->getLoopFor(CurMBB); + for (auto &UseMI : MRI->use_nodbg_instructions(DefReg)) { + MachineBasicBlock *UseMBB = UseMI.getParent(); + MachineLoop *UseLoop = MLI->getLoopFor(UseMBB); + + bool IsUseOutsideOfTheDefinitionLoop = + (CurLoop && !UseLoop) || + (CurLoop && UseLoop && + ((!UseLoop->contains(CurLoop) && !CurLoop->contains(UseLoop)) || + (UseLoop->contains(CurLoop) && (UseLoop != CurLoop)))); + + if (IsUseOutsideOfTheDefinitionLoop) { + CurrentNextUseDistance = calculateCurLoopDistance(DefReg, CurMI, &UseMI); + } else if (isIncomingValFromBackedge(CurMI, &UseMI, DefReg)) { + CurrentNextUseDistance = calculateBackedgeDistance(CurMI, &UseMI); + } else { + CurrentNextUseDistance = calculateShortestDistance(CurMI, &UseMI); + } + + if (CurrentNextUseDistance < NextUseDistance) + NextUseDistance = CurrentNextUseDistance; + } + return NextUseDistance != std::numeric_limits::max() + ? std::optional(NextUseDistance) + : std::nullopt; +} + +std::optional +AMDGPUNextUseAnalysis::getNextUseDistance(Register DefReg, MachineInstr *CurMI, + SmallVector &Uses) { + assert(!DefReg.isPhysical() && !TRI->isAGPR(*MRI, DefReg) && + "Next-use distance is calculated for SGPRs and VGPRs"); + double NextUseDistance = std::numeric_limits::max(); + double CurrentNextUseDistance = std::numeric_limits::max(); + MachineBasicBlock *CurMBB = CurMI->getParent(); + MachineLoop *CurLoop = MLI->getLoopFor(CurMBB); + for (auto *UseMI : Uses) { + MachineBasicBlock *UseMBB = UseMI->getParent(); + MachineLoop *UseLoop = MLI->getLoopFor(UseMBB); + + bool IsUseOutsideOfCurLoop = + (CurLoop && !UseLoop) || + (CurLoop && UseLoop && + ((!UseLoop->contains(CurLoop) && !CurLoop->contains(UseLoop)) || + (UseLoop->contains(CurLoop) && (UseLoop != CurLoop)))); + + if (IsUseOutsideOfCurLoop) { + CurrentNextUseDistance = calculateCurLoopDistance(DefReg, CurMI, UseMI); + } else if (isIncomingValFromBackedge(CurMI, UseMI, DefReg)) { + CurrentNextUseDistance = calculateBackedgeDistance(CurMI, UseMI); + } else { + CurrentNextUseDistance = calculateShortestDistance(CurMI, UseMI); + } + + if (CurrentNextUseDistance < NextUseDistance) + NextUseDistance = CurrentNextUseDistance; + } + return NextUseDistance != std::numeric_limits::max() + ? 
std::optional(NextUseDistance) + : std::nullopt; +} + +bool AMDGPUNextUseAnalysis::run(MachineFunction &MF, + const MachineLoopInfo *MLInfo) { + + const GCNSubtarget &ST = MF.getSubtarget(); + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + MLI = MLInfo; + MRI = &MF.getRegInfo(); + + for (MachineBasicBlock &BB : MF) { + double Id = 0.0; + for (MachineInstr &MI : BB) { + InstrToId[&MI] = ++Id; + } + } + + calculateShortestPaths(MF); + + if (DumpNextUseDistance) { + MF.print(errs()); + printAllDistances(MF); + } + + return true; +} + +bool AMDGPUNextUseAnalysisPass::runOnMachineFunction(MachineFunction &MF) { + MLI = &getAnalysis().getLI(); + AMDGPUNextUseAnalysis NUA; + return NUA.run(MF, MLI); +} + +char AMDGPUNextUseAnalysisPass::ID = 0; + +INITIALIZE_PASS_BEGIN(AMDGPUNextUseAnalysisPass, DEBUG_TYPE, + "Next Use Analysis", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LiveVariablesWrapperPass) +INITIALIZE_PASS_END(AMDGPUNextUseAnalysisPass, DEBUG_TYPE, "Next Use Analysis", + false, false) + +char &llvm::AMDGPUNextUseAnalysisID = AMDGPUNextUseAnalysisPass::ID; + +FunctionPass *llvm::createAMDGPUNextUseAnalysisPass() { + return new AMDGPUNextUseAnalysisPass(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.h new file mode 100644 index 0000000000000..25edf06d2afcf --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.h @@ -0,0 +1,161 @@ +//===---------------------- AMDGPUNextUseAnalysis.h ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements Next Use Analysis. +// For each register it goes over all uses and returns the estimated distance of +// the nearest use. This will be used for selecting which registers to spill +// before register allocation. +// +// This is based on ideas from the paper: +// "Register Spilling and Live-Range Splitting for SSA-Form Programs" +// Matthias Braun and Sebastian Hack, CC'09 +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUNEXTUSEANALYSIS_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUNEXTUSEANALYSIS_H + +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include + +using namespace llvm; + +class AMDGPUNextUseAnalysis { + const SIRegisterInfo *TRI = nullptr; + const SIInstrInfo *TII = nullptr; + const MachineLoopInfo *MLI = nullptr; + MachineRegisterInfo *MRI = nullptr; + /// Instruction to instruction-id map. + DenseMap InstrToId; + /// Returns MI's instruction ID. It renumbers (part of) the BB if MI is not + /// found in the map. + double getInstrId(MachineInstr *MI) { + auto It = InstrToId.find(MI); + if (It != InstrToId.end()) + return It->second; + // Renumber the MBB. + // TODO: Renumber from MI onwards. + auto *MBB = MI->getParent(); + double Id = 0.0; + for (auto &MBBMI : *MBB) + InstrToId[&MBBMI] = Id++; + return InstrToId.find(MI)->second; + } + /// [FromMBB, ToMBB] to shortest distance map. 
+  DenseMap<std::pair<MachineBasicBlock *, MachineBasicBlock *>, double>
+      ShortestPathTable;
+  /// We assume an approximate trip count of 1000 for all loops.
+  static constexpr double LoopWeight = 1000.0;
+  /// Returns the shortest distance from ShortestPathTable. Will crash if
+  /// {FromMBB, ToMBB} is not found.
+  double getShortestDistanceFromTable(MachineBasicBlock *FromMBB,
+                                      MachineBasicBlock *ToMBB) const {
+    auto It = ShortestPathTable.find({FromMBB, ToMBB});
+    assert(It != ShortestPathTable.end() && "Not found in table!");
+    return It->second;
+  }
+  bool isBackedge(MachineBasicBlock *From, MachineBasicBlock *To) const;
+  double getShortestPath(MachineBasicBlock *From, MachineBasicBlock *To);
+  /// Goes over all MBB pairs in \p MF, calculates the shortest path between
+  /// them and fills in \p ShortestPathTable.
+  void calculateShortestPaths(MachineFunction &MF);
+  /// If the path from \p MI to \p UseMI does not cross any loops, then this
+  /// \returns the shortest instruction distance between them.
+  double calculateShortestDistance(MachineInstr *MI, MachineInstr *UseMI);
+  /// \Returns the shortest distance between a given basic block \p CurMBB and
+  /// the closest exiting latch of \p CurLoop.
+  std::pair<double, MachineBasicBlock *>
+  getShortestDistanceToExitingLatch(MachineBasicBlock *CurMBB,
+                                    MachineLoop *CurLoop) const;
+  /// Helper function for calculating the minimum instruction distance from the
+  /// outer loop header to the outer loop latch.
+  std::pair<double, MachineBasicBlock *> getNestedLoopDistanceAndExitingLatch(
+      MachineBasicBlock *CurMBB, MachineBasicBlock *UseMBB,
+      bool IsUseOutsideOfTheCurLoopNest = false,
+      bool IsUseInParentLoop = false);
+  /// Given \p CurMI in a loop and \p UseMI outside the loop, this function
+  /// returns the minimum instruction path between \p CurMI and \p UseMI.
+  /// Note that since \p CurMI is in a loop we do not care about the exact
+  /// position of the instruction in the block, because we are making a rough
+  /// estimate of the dynamic instruction path length, given that the loop
+  /// iterates multiple times.
+  double calculateCurLoopDistance(Register DefReg, MachineInstr *CurMI,
+                                  MachineInstr *UseMI);
+  /// \Returns the shortest path distance from \p CurMI to the end of the loop
+  /// latch plus the distance from the top of the loop header to the PHI use.
+  double calculateBackedgeDistance(MachineInstr *CurMI, MachineInstr *UseMI);
+  /// \Returns true if the use of \p DefReg (\p UseMI) is a PHI in the loop
+  /// header, i.e., \p DefReg is flowing through the back-edge.
+  bool isIncomingValFromBackedge(MachineInstr *CurMI, MachineInstr *UseMI,
+                                 Register DefReg) const;
+
+  void dumpShortestPaths() const;
+
+  void printAllDistances(MachineFunction &);
+
+  void clearTables() {
+    InstrToId.clear();
+    ShortestPathTable.clear();
+  }
+
+public:
+  AMDGPUNextUseAnalysis() = default;
+
+  ~AMDGPUNextUseAnalysis() { clearTables(); }
+
+  bool run(MachineFunction &, const MachineLoopInfo *);
+
+  /// \Returns the next-use distance for \p DefReg.
+  std::optional<double> getNextUseDistance(Register DefReg);
+
+  std::optional<double>
+  getNextUseDistance(Register DefReg, MachineInstr *CurMI,
+                     SmallVector<MachineInstr *> &Uses);
+
+  /// Helper function that finds the shortest instruction path in \p CurMBB's
+  /// loop that includes \p CurMBB and starts from the loop header and ends at
+  /// the earliest loop latch. \Returns the path cost and the earliest latch
+  /// MBB.
+  std::pair<double, MachineBasicBlock *>
+  getLoopDistanceAndExitingLatch(MachineBasicBlock *CurMBB) const;
+  /// Returns the shortest distance from ShortestPathTable.
+  double getShortestDistance(MachineBasicBlock *FromMBB,
+                             MachineBasicBlock *ToMBB) const {
+    auto It = ShortestPathTable.find({FromMBB, ToMBB});
+    if (It == ShortestPathTable.end())
+      return std::numeric_limits<double>::max();
+    return It->second;
+  }
+};
+
+class AMDGPUNextUseAnalysisPass : public MachineFunctionPass {
+  const MachineLoopInfo *MLI = nullptr;
+
+public:
+  static char ID;
+
+  AMDGPUNextUseAnalysisPass() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &) override;
+
+  StringRef getPassName() const override { return "Next Use Analysis"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineLoopInfoWrapperPass>();
+    AU.addRequired<LiveVariablesWrapperPass>();
+    AU.addPreserved<MachineLoopInfoWrapperPass>();
+    AU.addPreserved<LiveVariablesWrapperPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUNEXTUSEANALYSIS_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 8a831f7915882..8c452d443f3bf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -538,6 +538,15 @@ static cl::opt<bool> EnableUniformIntrinsicCombine(
     cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
     cl::init(true), cl::Hidden);
 
+static cl::opt<bool> OptNextUseAnalysis("enable-next-use-analysis",
+                                        cl::desc("Enable next-use analysis"),
+                                        cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+    OptEarlyRegisterSpilling("enable-early-register-spilling",
+                             cl::desc("Enable early register spilling"),
+                             cl::init(true), cl::Hidden);
+
 extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
@@ -570,6 +579,8 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSIShrinkInstructionsLegacyPass(*PR);
   initializeSIOptimizeExecMaskingPreRALegacyPass(*PR);
   initializeSIOptimizeVGPRLiveRangeLegacyPass(*PR);
+  initializeAMDGPUNextUseAnalysisPassPass(*PR);
+  initializeAMDGPUEarlyRegisterSpillingPass(*PR);
   initializeSILoadStoreOptimizerLegacyPass(*PR);
   initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
   initializeAMDGPUAlwaysInlinePass(*PR);
@@ -1602,6 +1613,12 @@ void GCNPassConfig::addOptimizedRegAlloc() {
   if (OptVGPRLiveRange)
     insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeLegacyID);
 
+  if (OptNextUseAnalysis)
+    insertPass(&LiveVariablesID, &AMDGPUNextUseAnalysisID);
+
+  if (OptEarlyRegisterSpilling)
+    insertPass(&AMDGPUNextUseAnalysisID, &AMDGPUEarlyRegisterSpillingID);
+
   // This must be run immediately after phi elimination and before
   // TwoAddressInstructions, otherwise the processing of the tied operand of
   // SI_ELSE will introduce a copy of the tied operand source after the else.
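As described in the AMDGPUNextUseAnalysis.h comment above, the analysis exists to rank spill candidates by how far away their next use is: the Braun-Hack heuristic spills the live value whose next use is furthest in the future, and a value with no remaining use is the cheapest to evict. Below is a minimal, self-contained sketch of that selection step. It is illustrative only and not part of this patch; NextUseInfo and pickSpillCandidate are invented names standing in for the real query through AMDGPUNextUseAnalysis::getNextUseDistance() inside the early spilling pass.

// Illustrative sketch, not part of the patch: pick the spill candidate with
// the largest next-use distance (Braun-Hack heuristic). A register with no
// remaining use (std::nullopt) is treated as infinitely far away, so it is
// chosen first.
#include <cassert>
#include <limits>
#include <optional>
#include <vector>

struct NextUseInfo {
  unsigned VReg;                     // placeholder for a virtual register id
  std::optional<double> NextUseDist; // result of a next-use distance query
};

unsigned pickSpillCandidate(const std::vector<NextUseInfo> &LiveRegs) {
  assert(!LiveRegs.empty() && "nothing to spill");
  unsigned Best = LiveRegs.front().VReg;
  double BestDist = -1.0;
  for (const NextUseInfo &R : LiveRegs) {
    // No next use => infinite distance => best candidate to spill.
    double Dist = R.NextUseDist ? *R.NextUseDist
                                : std::numeric_limits<double>::max();
    if (Dist > BestDist) {
      BestDist = Dist;
      Best = R.VReg;
    }
  }
  return Best;
}

In the pass itself the candidate set is presumably bounded by the VGPR budget (see the -max-vgprs option used by the tests below), but the ranking idea is the one sketched here.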
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 782cbfa76e6e9..bb54b69eff89e 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -54,6 +54,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUCodeGenPrepare.cpp AMDGPUCombinerHelper.cpp AMDGPUCtorDtorLowering.cpp + AMDGPUEarlyRegisterSpilling.cpp AMDGPUExportClustering.cpp AMDGPUExportKernelRuntimeHandles.cpp AMDGPUFrameLowering.cpp @@ -90,6 +91,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUMCResourceInfo.cpp AMDGPUMarkLastScratchLoad.cpp AMDGPUMIRFormatter.cpp + AMDGPUNextUseAnalysis.cpp AMDGPUPerfHintAnalysis.cpp AMDGPUPostLegalizerCombiner.cpp AMDGPUPreLegalizerCombiner.cpp diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 8364e680bc8c7..b1094cce85fc9 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -356,9 +356,15 @@ ; GCN-O1-NEXT: Live Variable Analysis ; GCN-O1-NEXT: MachineDominator Tree Construction ; GCN-O1-NEXT: SI Optimize VGPR LiveRange +; GCN-O1-NEXT: Next Use Analysis +; GCN-O1-NEXT: MachineDominator Tree Construction +; GCN-O1-NEXT: Slot index numbering +; GCN-O1-NEXT: Live Interval Analysis +; GCN-O1-NEXT: AMDGPU Early Register Spilling ; GCN-O1-NEXT: Eliminate PHI nodes for register allocation ; GCN-O1-NEXT: SI Lower control flow pseudo instructions ; GCN-O1-NEXT: Two-Address instruction pass +; GCN-O1-NEXT: MachineDominator Tree Construction ; GCN-O1-NEXT: Slot index numbering ; GCN-O1-NEXT: Live Interval Analysis ; GCN-O1-NEXT: Machine Natural Loop Construction @@ -677,9 +683,15 @@ ; GCN-O1-OPTS-NEXT: Remove unreachable machine basic blocks ; GCN-O1-OPTS-NEXT: Live Variable Analysis ; GCN-O1-OPTS-NEXT: SI Optimize VGPR LiveRange +; GCN-O1-OPTS-NEXT: Next Use Analysis +; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction +; GCN-O1-OPTS-NEXT: Slot index numbering +; GCN-O1-OPTS-NEXT: Live Interval Analysis +; GCN-O1-OPTS-NEXT: AMDGPU Early Register Spilling ; GCN-O1-OPTS-NEXT: Eliminate PHI nodes for register allocation ; GCN-O1-OPTS-NEXT: SI Lower control flow pseudo instructions ; GCN-O1-OPTS-NEXT: Two-Address instruction pass +; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction ; GCN-O1-OPTS-NEXT: Slot index numbering ; GCN-O1-OPTS-NEXT: Live Interval Analysis ; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction @@ -1003,9 +1015,15 @@ ; GCN-O2-NEXT: Remove unreachable machine basic blocks ; GCN-O2-NEXT: Live Variable Analysis ; GCN-O2-NEXT: SI Optimize VGPR LiveRange +; GCN-O2-NEXT: Next Use Analysis +; GCN-O2-NEXT: MachineDominator Tree Construction +; GCN-O2-NEXT: Slot index numbering +; GCN-O2-NEXT: Live Interval Analysis +; GCN-O2-NEXT: AMDGPU Early Register Spilling ; GCN-O2-NEXT: Eliminate PHI nodes for register allocation ; GCN-O2-NEXT: SI Lower control flow pseudo instructions ; GCN-O2-NEXT: Two-Address instruction pass +; GCN-O2-NEXT: MachineDominator Tree Construction ; GCN-O2-NEXT: Slot index numbering ; GCN-O2-NEXT: Live Interval Analysis ; GCN-O2-NEXT: Machine Natural Loop Construction @@ -1343,9 +1361,15 @@ ; GCN-O3-NEXT: Remove unreachable machine basic blocks ; GCN-O3-NEXT: Live Variable Analysis ; GCN-O3-NEXT: SI Optimize VGPR LiveRange +; GCN-O3-NEXT: Next Use Analysis +; GCN-O3-NEXT: MachineDominator Tree Construction +; GCN-O3-NEXT: Slot index numbering +; GCN-O3-NEXT: Live Interval Analysis +; GCN-O3-NEXT: AMDGPU Early Register Spilling ; GCN-O3-NEXT: Eliminate PHI nodes for register allocation ; GCN-O3-NEXT: SI 
Lower control flow pseudo instructions ; GCN-O3-NEXT: Two-Address instruction pass +; GCN-O3-NEXT: MachineDominator Tree Construction ; GCN-O3-NEXT: Slot index numbering ; GCN-O3-NEXT: Live Interval Analysis ; GCN-O3-NEXT: Machine Natural Loop Construction diff --git a/llvm/test/CodeGen/AMDGPU/test_ers_basic_case.ll b/llvm/test/CodeGen/AMDGPU/test_ers_basic_case.ll new file mode 100644 index 0000000000000..9b6a1e7041860 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/test_ers_basic_case.ll @@ -0,0 +1,112 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=amdgpu-early-register-spilling -verify-machineinstrs -print-after=amdgpu-early-register-spilling -max-vgprs=10 < %s 2>&1 | FileCheck %s + +; +; bb.0.entry +; / | +; bb.3.bb2 | +; \ | +; bb.1.Flow +; / | +; bb.2.bb1 | +; \ | +; bb.4.exit +define amdgpu_ps i64 @test(ptr addrspace(3) %p1, ptr addrspace(3) %p2, i1 %cond1, i64 %val) { + ; CHECK-LABEL: name: test + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 1, [[V_AND_B32_e64_]], implicit $exec + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_NE_U32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.Flow: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:vreg_64 = PHI undef %12:vreg_64, %bb.0, %4, %bb.3 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %67:vgpr_32, %bb.3 + ; CHECK-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE [[SI_IF]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.bb1: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[PHI1]], 0, 0, implicit $exec :: (load (s8) from %ir.p1, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[PHI1]], 1, 0, implicit $exec :: (load (s8) from %ir.p1 + 1, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[PHI1]], 2, 0, implicit $exec :: (load (s8) from %ir.p1 + 2, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[PHI1]], 3, 0, implicit $exec :: (load (s8) from %ir.p1 + 3, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_4:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[PHI1]], 4, 0, implicit $exec :: (load (s8) from %ir.p1 + 4, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_5:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[PHI1]], 5, 0, implicit $exec :: (load (s8) from %ir.p1 + 5, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_6:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[PHI1]], 6, 0, implicit 
$exec :: (load (s8) from %ir.p1 + 6, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_7:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[PHI1]], 7, 0, implicit $exec :: (load (s8) from %ir.p1 + 7, addrspace 3) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U8_gfx9_5]], 8, [[DS_READ_U8_gfx9_4]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U8_gfx9_7]], 8, [[DS_READ_U8_gfx9_6]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_1]], 16, [[V_LSHL_OR_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_3:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U8_gfx9_1]], 8, [[DS_READ_U8_gfx9_]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_4:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U8_gfx9_3]], 8, [[DS_READ_U8_gfx9_2]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_5:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_4]], 16, [[V_LSHL_OR_B32_e64_3]], implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHL_OR_B32_e64_5]], %subreg.sub0, [[V_LSHL_OR_B32_e64_2]], %subreg.sub1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.bb2: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DS_READ_U8_gfx9_8:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY3]], 0, 0, implicit $exec :: (load (s8) from %ir.p2, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_9:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY3]], 1, 0, implicit $exec :: (load (s8) from %ir.p2 + 1, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_10:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY3]], 2, 0, implicit $exec :: (load (s8) from %ir.p2 + 2, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_11:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY3]], 3, 0, implicit $exec :: (load (s8) from %ir.p2 + 3, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_12:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY3]], 4, 0, implicit $exec :: (load (s8) from %ir.p2 + 4, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_13:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY3]], 5, 0, implicit $exec :: (load (s8) from %ir.p2 + 5, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_14:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY3]], 6, 0, implicit $exec :: (load (s8) from %ir.p2 + 6, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_15:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY3]], 7, 0, implicit $exec :: (load (s8) from %ir.p2 + 7, addrspace 3) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_6:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U8_gfx9_13]], 8, [[DS_READ_U8_gfx9_12]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_7:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U8_gfx9_15]], 8, [[DS_READ_U8_gfx9_14]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_8:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_7]], 16, [[V_LSHL_OR_B32_e64_6]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_9:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U8_gfx9_9]], 8, [[DS_READ_U8_gfx9_8]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_10:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U8_gfx9_11]], 8, [[DS_READ_U8_gfx9_10]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_11:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_10]], 16, [[V_LSHL_OR_B32_e64_9]], implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHL_OR_B32_e64_11]], %subreg.sub0, 
[[V_LSHL_OR_B32_e64_8]], %subreg.sub1 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.exit: + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:vreg_64 = PHI [[PHI]], %bb.1, [[COPY5]], %bb.2 + ; CHECK-NEXT: SI_END_CF [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[PHI2]].sub0, [[COPY1]], 0, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: %59:vgpr_32, dead $sgpr_null = V_ADDC_U32_e64 [[PHI2]].sub1, [[SI_SPILL_V32_RESTORE]], [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_ADD_CO_U32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 %59, implicit $exec + ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG killed $sgpr0, killed $sgpr1 +entry: +; entry +; / \ +; bb1 bb2 +; \ / +; exit + br i1 %cond1, label %bb1, label %bb2 + +bb1: + %ld1 = load i64, ptr addrspace(3) %p1, align 1 + br label %exit + +bb2: + %ld2 = load i64, ptr addrspace(3) %p2, align 1 + br label %exit + +exit: + %phi = phi i64 [ %ld1, %bb1 ], [ %ld2, %bb2 ] + %add = add i64 %phi, %val + ret i64 %add +} diff --git a/llvm/test/CodeGen/AMDGPU/test_ers_do_not_spill_restore_inside_loop.ll b/llvm/test/CodeGen/AMDGPU/test_ers_do_not_spill_restore_inside_loop.ll new file mode 100644 index 0000000000000..af29219e0cb67 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/test_ers_do_not_spill_restore_inside_loop.ll @@ -0,0 +1,239 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=amdgpu-early-register-spilling -verify-machineinstrs -print-after=amdgpu-early-register-spilling -max-vgprs=13 < %s 2>&1 | FileCheck %s + +; +; bb.0.entry +; | +; bb.1.loop1.header<---+ +; / | | +;bb.4.loop1.latch2 | | +; \ | | +; bb.2.Flow | +; / | | +;bb.3.loop1.latch1 | | +; \ | | +; bb.5.Flow1-------+ +; | +; bb.6.bb +; | +; bb.7.loop2<------+ +; | | +; +------------+ +; | +; bb.8.exit +; +define amdgpu_ps void @test(ptr addrspace(1) %p1, ptr addrspace(1) %p2, ptr addrspace(1) %p3, ptr addrspace(1) %p4, ptr addrspace(1) %p5, i1 %cond, i32 %TC1, i32 %TC2, i32 %TC3) { + ; CHECK-LABEL: name: test + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr13 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr12 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: 
[[COPY11:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY2]], %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[REG_SEQUENCE]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY12]], %subreg.sub1 + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 1, [[V_AND_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 0, 0, implicit $exec :: (load (s8) from %ir.p1, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 1, 0, implicit $exec :: (load (s8) from %ir.p1 + 1, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE1]], 8, [[GLOBAL_LOAD_UBYTE]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE2:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 2, 0, implicit $exec :: (load (s8) from %ir.p1 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE3:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 3, 0, implicit $exec :: (load (s8) from %ir.p1 + 3, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE3]], 8, [[GLOBAL_LOAD_UBYTE2]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_1]], 16, [[V_LSHL_OR_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 10, implicit $exec + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[REG_SEQUENCE1]], %stack.3, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.loop1.header: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %24, %bb.5 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.0, %20, %bb.5 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, %19, %bb.5 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[V_LSHL_OR_B32_e64_2]], %bb.0, %22, %bb.5 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_NE_U32_e64_]], %bb.2, 
implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.Flow: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.1, %122, %bb.4 + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %77:vgpr_32, %bb.1, %16, %bb.4 + ; CHECK-NEXT: [[PHI6:%[0-9]+]]:vgpr_32 = PHI undef %77:vgpr_32, %bb.1, %15, %bb.4 + ; CHECK-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %77:vgpr_32, %bb.1, %17, %bb.4 + ; CHECK-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[PHI2]], %bb.1, undef %129:vgpr_32, %bb.4 + ; CHECK-NEXT: [[PHI9:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %131:vgpr_32, %bb.4 + ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE [[SI_IF]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.loop1.latch1: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_SUB_U32_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_LSHL_OR_B32_e64_2]], [[PHI8]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[PHI9]], 0, implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[V_ADD_U32_e64_]], [[SI_SPILL_V32_RESTORE]], implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[PHI4]], $exec_lo, implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_GE_U32_e64_]], $exec_lo, implicit-def dead $scc + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_]], [[S_AND_B32_]], implicit-def dead $scc + ; CHECK-NEXT: S_BRANCH %bb.5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.loop1.latch2: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (load (s32) from %ir.p2, addrspace 1) + ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[PHI2]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[GLOBAL_LOAD_DWORD]], [[V_ADD_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[V_ADD_U32_e64_1]], [[COPY1]], implicit $exec + ; CHECK-NEXT: [[S_ORN2_B32_:%[0-9]+]]:sreg_32 = S_ORN2_B32 [[V_CMP_GE_U32_e64_1]], $exec_lo, implicit-def dead $scc + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5.Flow2: + ; CHECK-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI10:%[0-9]+]]:sreg_32 = PHI [[PHI4]], %bb.2, [[S_OR_B32_]], %bb.3 + ; CHECK-NEXT: [[PHI11:%[0-9]+]]:vgpr_32 = PHI [[PHI5]], %bb.2, [[V_MOV_B32_e32_3]], %bb.3 + ; CHECK-NEXT: [[PHI12:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_2]], %bb.2, [[V_ADD_U32_e64_]], %bb.3 + ; CHECK-NEXT: [[PHI13:%[0-9]+]]:vgpr_32 = PHI [[PHI6]], %bb.2, [[V_LSHL_OR_B32_e64_2]], %bb.3 + ; CHECK-NEXT: [[PHI14:%[0-9]+]]:vgpr_32 = PHI [[PHI7]], %bb.2, [[V_SUB_U32_e64_]], %bb.3 + ; CHECK-NEXT: SI_END_CF [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[PHI10]], [[PHI]], implicit-def dead $scc + ; CHECK-NEXT: SI_LOOP 
[[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6.bb1: + ; CHECK-NEXT: successors: %bb.7(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[PHI14]], [[V_LSHL_OR_B32_e64_2]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[V_MUL_LO_U32_e64_]], 0, 0, implicit $exec :: (store (s32) into %ir.p3, addrspace 1) + ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7.loop2: + ; CHECK-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI15:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_2]], %bb.6, %34, %bb.7 + ; CHECK-NEXT: [[PHI16:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_2]], %bb.6, %33, %bb.7 + ; CHECK-NEXT: [[PHI17:%[0-9]+]]:vgpr_32 = PHI [[PHI14]], %bb.6, [[V_LSHL_OR_B32_e64_2]], %bb.7 + ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI16]], 2, implicit-def dead $scc + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_2:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[S_ADD_I32_]], [[SI_SPILL_V32_RESTORE1]], implicit $exec + ; CHECK-NEXT: [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[V_CMP_GE_U32_e64_2]], [[PHI15]], implicit-def dead $scc + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]], implicit $exec + ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK1]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8.exit: + ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI17]], [[COPY14]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_3]], [[PHI14]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[V_ADD_U32_e64_4]], [[V_MUL_LO_U32_e64_]], [[V_ADD_U32_e64_3]], implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[V_LSHL_OR_B32_e64_2]], [[V_ADD3_U32_e64_]], 100, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD3_U32_e64_1]], [[PHI17]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_1:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_ADD_U32_e64_4]], [[PHI13]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_MUL_LO_U32_e64_1]], [[V_SUB_U32_e64_1]], implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[SI_SPILL_V64_RESTORE]], 0, 0, implicit $exec :: (load (s32) from %ir.p4, addrspace 1) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_MUL_LO_U32_e64_2]], [[GLOBAL_LOAD_DWORD1]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_2:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_ADD_U32_e64_4]], [[GLOBAL_LOAD_DWORD1]], 0, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE1:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.1, $sgpr32, 0, 
implicit $exec :: (load (s64) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD2:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[SI_SPILL_V64_RESTORE1]], 0, 0, implicit $exec :: (load (s32) from %ir.p5, addrspace 1) + ; CHECK-NEXT: [[V_SUB_U32_e64_3:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_SUB_U32_e64_2]], [[GLOBAL_LOAD_DWORD2]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[V_MUL_LO_U32_e64_3]], [[V_SUB_U32_e64_3]], [[PHI3]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[SI_SPILL_V64_RESTORE]], [[V_ADD3_U32_e64_2]], 0, 0, implicit $exec :: (store (s32) into %ir.p4, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[SI_SPILL_V64_RESTORE1]], [[V_MUL_LO_U32_e64_]], 0, 0, implicit $exec :: (store (s32) into %ir.p5, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 +entry: +; entry +; | +; +-------->loop1.header<--------+ +; | / \ | +; +--loop1.latch1 loop1.latch2--+ +; \ / +; bb1 +; | +; +<-----+ +; loop2 | +; +------+ +; | +; exit + %ld1 = load i32, ptr addrspace(1) %p1, align 1 + %add1 = add i32 %ld1, 100 + br label %loop1.header + +loop1.header: + %phi.inc1 = phi i32 [ 0, %entry ], [ %inc1, %loop1.latch1 ], [ 0, %loop1.latch2 ] + %phi.inc2 = phi i32 [ 10, %entry ], [ 0, %loop1.latch1 ], [ %inc2, %loop1.latch2 ] + %phi1 = phi i32 [ %ld1, %entry ], [ %sub, %loop1.latch1 ], [ %add2, %loop1.latch2 ] + br i1 %cond, label %loop1.latch1, label %loop1.latch2 + +loop1.latch1: + %sub = sub i32 %ld1, %phi.inc2 + %inc1 = add i32 %phi.inc1, 1 + %cond1 = icmp ult i32 %inc1, %TC1 + br i1 %cond1, label %loop1.header, label %bb1 + +loop1.latch2: + %ld2 = load i32, ptr addrspace(1) %p2, align 4 + %inc2 = add i32 %phi.inc2, 1 + %add2 = add i32 %ld2, %inc2 + %cond2 = icmp ult i32 %inc2, %TC2 + br i1 %cond2, label %loop1.header, label %bb1 + +bb1: + %phi2 = phi i32 [ %sub, %loop1.latch1 ], [ %add2, %loop1.latch2 ] + %ld3 = phi i32 [ %ld1, %loop1.latch1 ], [ %ld2, %loop1.latch2 ] + %mul = mul i32 %phi2, %ld1 + store i32 %mul, ptr addrspace(1) %p3 + br label %loop2 + +loop2: + %phi.inc3 = phi i32 [ 0, %bb1 ], [ %inc3, %loop2 ] + %phi3 = phi i32 [ %phi2, %bb1 ], [ %ld1, %loop2 ] + %inc3 = add i32 %phi.inc3, 2 + %add3 = add i32 %phi3, %inc3 + %cond3 = icmp ult i32 %inc3, %TC3 + br i1 %cond3, label %loop2, label %exit + +exit: + %add4 = add i32 %add3, %phi2 + %add5 = add i32 %add4, %mul + %add6 = add i32 %add5, %add3 + %add7 = add i32 %add6, %add1 + %mul2 = mul i32 %add7, %phi3 + %sub1 = sub i32 %add4, %ld3 + %mul3 = mul i32 %mul2, %sub1 + %ld4 = load i32, ptr addrspace(1) %p4, align 4 + %mul4 = mul i32 %mul3, %ld4 + %sub2 = sub i32 %add4, %ld4 + %ld5 = load i32, ptr addrspace(1) %p5, align 4 + %sub3 = sub i32 %sub2, %ld5 + %add8 = add i32 %mul4, %sub3 + %add9 = add i32 %add8, %phi1 + store i32 %add9, ptr addrspace(1) %p4, align 4 + store i32 %mul, ptr addrspace(1) %p5, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/test_ers_emit_restore_in_common_dominator.ll b/llvm/test/CodeGen/AMDGPU/test_ers_emit_restore_in_common_dominator.ll new file mode 100644 index 0000000000000..40759a2a8065f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/test_ers_emit_restore_in_common_dominator.ll @@ -0,0 +1,283 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=amdgpu-early-register-spilling -verify-machineinstrs -print-after=amdgpu-early-register-spilling -max-vgprs=8 < %s 2>&1 | FileCheck %s + +; +; bb.0.entry +; / | +; bb.1.bb1 | +; \ | +; bb.2.bb2 +; / | +; bb.5.bb4 
| +; \ | +; bb.3.Flow3 +; / | +; bb.4.bb3 | +; \ | +; bb.6.bb5 +; / | +; bb.12.bb7 | +; \ | +; bb.7.Flow2 +; / | +; bb.8.bb6 | +; / | | +;bb.11.bb9 | | +; \ | | +; bb.9.Flow | +; / | | +;bb.10.bb8 | | +; \ | | +; bb.13.Flow1 | +; \ | +; bb.14.exit +; +define amdgpu_ps i32 @test(ptr addrspace(1) %p1, ptr addrspace(3) %p2, i1 %cond1, i1 %cond2) { + ; CHECK-LABEL: name: test + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 1, [[V_AND_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 1, [[V_AND_B32_e64_1]], implicit $exec + ; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_1]], -1, implicit-def dead $scc + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.p1, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 1, 0, implicit $exec :: (load (s8) from %ir.p1 + 1, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE1]], 8, [[GLOBAL_LOAD_UBYTE]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE2:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 2, 0, implicit $exec :: (load (s8) from %ir.p1 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE3:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 3, 0, implicit $exec :: (load (s8) from %ir.p1 + 3, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE3]], 8, [[GLOBAL_LOAD_UBYTE2]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_1]], 16, [[V_LSHL_OR_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE4:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 12, 0, implicit $exec :: (load (s8) from %ir.gep1, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE5:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 13, 0, implicit $exec :: (load (s8) from %ir.gep1 + 1, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE6:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 14, 0, implicit $exec :: (load (s8) from %ir.gep1 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE7:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 15, 0, implicit $exec :: (load (s8) from %ir.gep1 + 3, addrspace 1) + ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 100, [[V_LSHL_OR_B32_e64_2]], 0, implicit $exec + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.bb1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; 
CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 100, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.bb2: + ; CHECK-NEXT: successors: %bb.5(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_ADD_U32_e64_]], %bb.0, [[V_MOV_B32_e32_]], %bb.1 + ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_LSHL_OR_B32_e64_2]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY2]], 0, 0, implicit $exec :: (load (s8) from %ir.p2, addrspace 3) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[GLOBAL_LOAD_UBYTE7]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY2]], 1, 0, implicit $exec :: (load (s8) from %ir.p2 + 1, addrspace 3) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[GLOBAL_LOAD_UBYTE6]], %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY2]], 2, 0, implicit $exec :: (load (s8) from %ir.p2 + 2, addrspace 3) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[GLOBAL_LOAD_UBYTE4]], %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY2]], 3, 0, implicit $exec :: (load (s8) from %ir.p2 + 3, addrspace 3) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY2]], %stack.4, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_3:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U8_gfx9_1]], 8, [[DS_READ_U8_gfx9_]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_4:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U8_gfx9_3]], 8, [[DS_READ_U8_gfx9_2]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_5:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_4]], 16, [[V_LSHL_OR_B32_e64_3]], implicit $exec + ; CHECK-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[S_XOR_B32_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.Flow3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %64:vgpr_32, %bb.2, %11, %bb.5 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.2, undef %119:vgpr_32, %bb.5 + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_6:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE5]], 8, [[SI_SPILL_V32_RESTORE]], implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE2:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_7:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[SI_SPILL_V32_RESTORE1]], 8, [[SI_SPILL_V32_RESTORE2]], implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE3:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + 
; CHECK-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE [[SI_IF1]], %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.bb3: + ; CHECK-NEXT: successors: %bb.6(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %74:vreg_64, $sgpr_null = V_MAD_U64_U32_e64 [[SI_SPILL_V32_RESTORE3]], [[PHI2]], 1000, 0, implicit $exec + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY %74.sub0 + ; CHECK-NEXT: S_BRANCH %bb.6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5.bb4: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[V_LSHL_OR_B32_e64_5]], 0, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6.bb5: + ; CHECK-NEXT: successors: %bb.12(0x40000000), %bb.7(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.3, [[COPY5]], %bb.4 + ; CHECK-NEXT: SI_END_CF [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_8:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_7]], 16, [[V_LSHL_OR_B32_e64_6]], implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE4:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) + ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[SI_SPILL_V32_RESTORE4]], 12, 0, implicit $exec :: (load (s32) from %ir.gep2, align 8, addrspace 3) + ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DS_READ_B32_gfx9_]], [[PHI3]], 0, implicit $exec + ; CHECK-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 [[S_XOR_B32_1]], -1, implicit-def dead $scc + ; CHECK-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32 = SI_IF [[S_XOR_B32_2]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7.Flow2: + ; CHECK-NEXT: successors: %bb.8(0x40000000), %bb.14(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI undef %76:vgpr_32, %bb.6, %23, %bb.12 + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[V_LSHL_OR_B32_e64_8]], %bb.6, undef %121:vgpr_32, %bb.12 + ; CHECK-NEXT: [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[V_ADD_U32_e64_2]], %bb.6, undef %123:vgpr_32, %bb.12 + ; CHECK-NEXT: [[SI_ELSE1:%[0-9]+]]:sreg_32 = SI_ELSE [[SI_IF2]], %bb.14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8.bb6: + ; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.9(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_XOR_B32_3:%[0-9]+]]:sreg_32 = S_XOR_B32 [[S_AND_B32_]], -1, implicit-def dead $scc + ; CHECK-NEXT: [[SI_IF3:%[0-9]+]]:sreg_32 = SI_IF [[S_XOR_B32_3]], %bb.9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.9.Flow: + ; CHECK-NEXT: successors: %bb.10(0x40000000), %bb.13(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %106:vgpr_32, %bb.8, %22, %bb.11 + ; CHECK-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[PHI5]], %bb.8, undef %125:vgpr_32, 
%bb.11 + ; CHECK-NEXT: [[PHI9:%[0-9]+]]:vgpr_32 = PHI [[PHI6]], %bb.8, undef %127:vgpr_32, %bb.11 + ; CHECK-NEXT: [[SI_ELSE2:%[0-9]+]]:sreg_32 = SI_ELSE [[SI_IF3]], %bb.13, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.10.bb8: + ; CHECK-NEXT: successors: %bb.13(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI8]], [[PHI9]], 0, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.11.bb9: + ; CHECK-NEXT: successors: %bb.9(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[PHI5]], [[PHI6]], implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.12.bb7: + ; CHECK-NEXT: successors: %bb.7(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_SUB_U32_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[DS_READ_B32_gfx9_]], [[V_ADD_U32_e64_2]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_SUB_U32_e64_]], [[V_LSHL_OR_B32_e64_5]], implicit $exec + ; CHECK-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[SI_SPILL_V32_RESTORE3]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_RCP_IFLAG_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_IFLAG_F32_e64 0, [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_MUL_F32_e64 0, 1333788670, 0, [[V_RCP_IFLAG_F32_e64_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_CVT_U32_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_1:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 0, [[SI_SPILL_V32_RESTORE3]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_SUB_U32_e64_1]], [[V_CVT_U32_F32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_]], [[V_MUL_LO_U32_e64_2]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_CVT_U32_F32_e64_]], [[V_MUL_HI_U32_e64_]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_MUL_LO_U32_e64_1]], [[V_ADD_U32_e64_4]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_MUL_HI_U32_e64_1]], [[SI_SPILL_V32_RESTORE3]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_2:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_MUL_LO_U32_e64_1]], [[V_MUL_LO_U32_e64_3]], 0, implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_GE_U32_e64 [[V_SUB_U32_e64_2]], [[SI_SPILL_V32_RESTORE3]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[V_MUL_HI_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MUL_HI_U32_e64_1]], 0, [[V_ADD_U32_e64_5]], [[V_CMP_GE_U32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_3:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_SUB_U32_e64_2]], [[SI_SPILL_V32_RESTORE3]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_SUB_U32_e64_2]], 0, [[V_SUB_U32_e64_3]], [[V_CMP_GE_U32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_GE_U32_e64 
[[V_CNDMASK_B32_e64_1]], [[SI_SPILL_V32_RESTORE3]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[V_CNDMASK_B32_e64_]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_]], 0, [[V_ADD_U32_e64_6]], [[V_CMP_GE_U32_e64_1]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_CNDMASK_B32_e64_2]], [[V_LSHL_OR_B32_e64_8]], 0, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.13.Flow1: + ; CHECK-NEXT: successors: %bb.14(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI10:%[0-9]+]]:vgpr_32 = PHI [[PHI7]], %bb.9, [[V_ADD_U32_e64_3]], %bb.10 + ; CHECK-NEXT: SI_END_CF [[SI_ELSE2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.14.exit: + ; CHECK-NEXT: [[PHI11:%[0-9]+]]:vgpr_32 = PHI [[PHI4]], %bb.7, [[PHI10]], %bb.13 + ; CHECK-NEXT: SI_END_CF [[SI_ELSE1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[V_LSHL_OR_B32_e64_5]], [[PHI11]], 100, implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_ADD3_U32_e64_]], implicit $exec + ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG killed $sgpr0 +entry: +; entry +; / \ +; bb1 | +; \ | +; BB2 +; / \ +; BB3 BB4 +; \ / +; BB5 +; / \ +; BB6 BB7 +; / \ | +; BB8 BB9 | +; \ | | +; \ | / +; exit + %ld1 = load i32, ptr addrspace(1) %p1, align 1 + %gep1 = getelementptr inbounds i32, ptr addrspace(1) %p1, i64 3 + %ld2 = load i32, ptr addrspace(1) %gep1, align 1 + %add1 = add i32 %ld1, 100 + br i1 %cond1, label %bb1, label %bb2 + +bb1: + br label %bb2 + +bb2: + %phi0 = phi i32 [ 100, %bb1 ], [ %add1, %entry ] + %ld3 = load i32, ptr addrspace(3) %p2, align 1 + %add2 = add i32 %ld3, 100 + br i1 %cond2, label %bb3, label %bb4 + +bb3: + %mul1 = mul i32 %ld1, %phi0 + %add3 = add i32 %mul1, 1000 + br label %bb5 + +bb4: + %add4 = add i32 %add2, %ld1 + br label %bb5 + +bb5: + %phi1 = phi i32 [ %add3, %bb3 ], [ %add4, %bb4] + %gep2 = getelementptr inbounds i32, ptr addrspace(3) %p2, i64 3 + %ld4 = load i32, ptr addrspace(3) %gep2, align 8 + %add5 = add i32 %ld4, %phi1 + %xor = xor i1 %cond1, %cond2 + br i1 %xor, label %bb6, label %bb7 + +bb6: + %and = and i1 %cond1, %cond2 + br i1 %and, label %bb8, label %bb9 + +bb8: + %add6 = add i32 %ld2, %add5 + br label %exit + +bb9: + %mul2 = mul i32 %ld2, %add5 + br label %exit + +bb7: + %sub1 = sub i32 %ld4, %add5 + %mul3 = mul i32 %sub1, %ld3 + %div = udiv i32 %mul3, %ld1 + %add7 = add i32 %div, %ld2 + br label %exit + +exit: + %phi2 = phi i32 [ %add6, %bb8 ], [ %mul2, %bb9], [ %add7, %bb7 ] + %add8 = add i32 %add2, %phi2 + ret i32 %add8 +} diff --git a/llvm/test/CodeGen/AMDGPU/test_ers_emit_restore_in_loop_preheader1.ll b/llvm/test/CodeGen/AMDGPU/test_ers_emit_restore_in_loop_preheader1.ll new file mode 100644 index 0000000000000..d6a55af8d0562 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/test_ers_emit_restore_in_loop_preheader1.ll @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=amdgpu-early-register-spilling -verify-machineinstrs -print-after=amdgpu-early-register-spilling -max-vgprs=10 < %s 2>&1 | FileCheck %s + +; +; bb.0.entry +; | +; +<-----+ +; bb.1.loop | +; +------+ +; | +; bb.2.exit +; 
+define amdgpu_ps void @test(ptr addrspace(1) %p1, ptr addrspace(1) %p2, ptr addrspace(1) %p3, ptr addrspace(1) %p4, ptr addrspace(1) %p5, i32 %TC) { + ; CHECK-LABEL: name: test + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY7]], %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[REG_SEQUENCE]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[SI_SPILL_V32_RESTORE]], %subreg.sub1 + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE4]], 0, 0, implicit $exec :: (load (s32) from %ir.p2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (load (s8) from %ir.p3, addrspace 1) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[GLOBAL_LOAD_DWORD]], %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 1, 0, implicit $exec :: (load (s8) from %ir.p3 + 1, addrspace 1) + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[REG_SEQUENCE1]], %stack.4, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE1]], 8, [[GLOBAL_LOAD_UBYTE]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE2:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 2, 0, implicit $exec :: (load (s8) from %ir.p3 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE3:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 3, 0, implicit $exec :: (load (s8) from %ir.p3 + 3, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE3]], 8, [[GLOBAL_LOAD_UBYTE2]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_2:%[0-9]+]]:vgpr_32 = 
V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_1]], 16, [[V_LSHL_OR_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (load (s32) from %ir.p4, addrspace 1) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[GLOBAL_LOAD_DWORD1]], [[SI_SPILL_V32_RESTORE1]], 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE4]], [[V_ADD_U32_e64_]], 0, 0, implicit $exec :: (store (s32) into %ir.p2, addrspace 1) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD1]], [[V_ADD_U32_e64_]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE3]], [[V_MUL_LO_U32_e64_]], 0, 0, implicit $exec :: (store (s32) into %ir.p3, addrspace 1) + ; CHECK-NEXT: [[V_SUB_U32_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_MUL_LO_U32_e64_]], [[V_ADD_U32_e64_]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_SUB_U32_e64_]], [[SI_SPILL_V32_RESTORE1]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[V_MUL_LO_U32_e64_1]], 0, 0, implicit $exec :: (store (s32) into %ir.p4, addrspace 1) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 100, implicit $exec + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.loop: + ; CHECK-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %7, %bb.1 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, %5, %bb.1 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %6, %bb.1 + ; CHECK-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[PHI2]], 31, implicit-def dead $scc + ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[PHI2]], %subreg.sub0, [[S_ASHR_I32_]], %subreg.sub1 + ; CHECK-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = nsw S_LSHL_B64 [[REG_SEQUENCE5]], 2, implicit-def dead $scc + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[SI_SPILL_V64_RESTORE]].sub0, [[S_LSHL_B64_]].sub0, 0, implicit $exec + ; CHECK-NEXT: %94:vgpr_32, dead $sgpr_null = V_ADDC_U32_e64 [[S_LSHL_B64_]].sub1, [[SI_SPILL_V64_RESTORE]].sub1, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %94, %subreg.sub1 + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE4:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE6]], 0, 0, implicit $exec :: (load (s8) from %ir.gep, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE5:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE6]], 1, 0, implicit $exec :: (load (s8) from %ir.gep + 1, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_3:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE5]], 8, [[GLOBAL_LOAD_UBYTE4]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE6:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE6]], 2, 0, implicit $exec :: (load (s8) from %ir.gep + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE7:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE 
[[REG_SEQUENCE6]], 3, 0, implicit $exec :: (load (s8) from %ir.gep + 3, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_4:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE7]], 8, [[GLOBAL_LOAD_UBYTE6]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_5:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_4]], 16, [[V_LSHL_OR_B32_e64_3]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI2]], [[V_LSHL_OR_B32_e64_5]], 0, implicit $exec + ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI2]], 1, implicit-def dead $scc + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[S_ADD_I32_]], [[SI_SPILL_V32_RESTORE1]], implicit $exec + ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[V_CMP_GE_U32_e64_]], [[PHI]], implicit-def dead $scc + ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.exit: + ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHL_OR_B32_e64_2]], %subreg.sub0, undef %87:vgpr_32, %subreg.sub1 + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE2:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; CHECK-NEXT: %72:vreg_64, $sgpr_null = V_MAD_U64_U32_e64 [[SI_SPILL_V32_RESTORE2]], [[PHI1]], [[REG_SEQUENCE7]], 0, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE1:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[SI_SPILL_V64_RESTORE1]], %72.sub0, 0, 0, implicit $exec :: (store (s32) into %ir.p5, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 +entry: +; entry +; | +; +<----+ +; loop | +; +-----+ +; | +; exit + %ld1 = load i32, ptr addrspace(1) %p2, align 4 + %ld2 = load i32, ptr addrspace(1) %p3, align 1 + %ld3 = load i32, ptr addrspace(1) %p4 + %add1 = add i32 %ld3, %TC + store i32 %add1, ptr addrspace(1) %p2 + %mul1 = mul i32 %ld3, %add1 + store i32 %mul1, ptr addrspace(1) %p3 + %sub1 = sub i32 %mul1, %add1 + %mul2 = mul i32 %sub1, %TC + store i32 %mul2, ptr addrspace(1) %p4 + br label %loop + +loop: + %phi = phi i32 [ 100, %entry ], [ %add, %loop ] + %phi.inc = phi i32 [ 0, %entry ], [ %inc, %loop ] + %sext = sext i32 %phi.inc to i64 + %gep = getelementptr inbounds i32, ptr addrspace(1) %p1, i64 %sext + %ld = load i32, ptr addrspace(1) %gep, align 1 + %add = add i32 %ld, %phi.inc + %inc = add i32 %phi.inc, 1 + %cond = icmp ult i32 %inc, %TC + br i1 %cond, label %loop, label %exit + +exit: + %mul3 = mul i32 %ld1, %phi + %add2 = add i32 %mul3, %ld2 + store i32 %add2, ptr addrspace(1) %p5 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/test_ers_emit_restore_in_loop_preheader2.ll b/llvm/test/CodeGen/AMDGPU/test_ers_emit_restore_in_loop_preheader2.ll new file mode 100644 index 0000000000000..f865458f3826a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/test_ers_emit_restore_in_loop_preheader2.ll @@ -0,0 +1,178 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=amdgpu-early-register-spilling -verify-machineinstrs -print-after=amdgpu-early-register-spilling -max-vgprs=10 < %s 2>&1 | FileCheck %s + +; +; bb.0.entry +; | +; +<--------+ +; bb.1.loop1 | +; +---------+ 
+; | +; bb.2.bb +; | +; +<--------+ +; bb.3.loop2 | +; +---------+ +; | +; bb.4.exit +; +define amdgpu_ps i32 @test(ptr addrspace(1) %p1, ptr addrspace(1) %p2, ptr addrspace(1) %p3, i32 %TC1, i32 %TC2) { + ; CHECK-LABEL: name: test + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (load (s8) from %ir.p1, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 1, 0, implicit $exec :: (load (s8) from %ir.p1 + 1, addrspace 1) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE1]], 8, [[GLOBAL_LOAD_UBYTE]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE2:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 2, 0, implicit $exec :: (load (s8) from %ir.p1 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE3:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 3, 0, implicit $exec :: (load (s8) from %ir.p1 + 3, addrspace 1) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY1]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE3]], 8, [[GLOBAL_LOAD_UBYTE2]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_1]], 16, [[V_LSHL_OR_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 100, implicit $exec + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.loop1: + ; CHECK-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.0, %13, %bb.1 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.0, %11, %bb.1 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %10, %bb.1 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[V_LSHL_OR_B32_e64_2]], %bb.0, %8, %bb.1 + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, %9, %bb.1 + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[V_LSHL_OR_B32_e64_2]], %bb.0, %12, %bb.1 + ; CHECK-NEXT: 
[[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI2]], -1, implicit-def dead $scc + ; CHECK-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_ADD_I32_]], 31, implicit-def dead $scc + ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_I32_]], %subreg.sub0, [[S_ASHR_I32_]], %subreg.sub1 + ; CHECK-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = nsw S_LSHL_B64 [[REG_SEQUENCE3]], 2, implicit-def dead $scc + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[REG_SEQUENCE1]].sub0, [[S_LSHL_B64_]].sub0, 0, implicit $exec + ; CHECK-NEXT: %98:vgpr_32, dead $sgpr_null = V_ADDC_U32_e64 [[S_LSHL_B64_]].sub1, [[REG_SEQUENCE1]].sub1, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %98, %subreg.sub1 + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE4]], 0, 0, implicit $exec :: (load (s32) from %ir.gep1, addrspace 1) + ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI2]], [[GLOBAL_LOAD_DWORD]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD]], [[PHI2]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_SHORT_D16_HI [[REG_SEQUENCE2]], [[V_MUL_LO_U32_e64_]], 2, 0, implicit $exec :: (store (s16) into %ir.p1 + 2, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE2]], [[V_MUL_LO_U32_e64_]], 0, 0, implicit $exec :: (store (s16) into %ir.p1, addrspace 1) + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[REG_SEQUENCE]].sub0, [[S_LSHL_B64_]].sub0, 0, implicit $exec + ; CHECK-NEXT: %106:vgpr_32, dead $sgpr_null = V_ADDC_U32_e64 [[S_LSHL_B64_]].sub1, [[REG_SEQUENCE]].sub1, [[V_ADD_CO_U32_e64_3]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, %106, %subreg.sub1 + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[REG_SEQUENCE5]], 0, 0, implicit $exec :: (load (s16) from %ir.gep2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[REG_SEQUENCE5]], 2, 0, implicit $exec :: (load (s16) from %ir.gep2 + 2, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_3:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_USHORT1]], 16, [[GLOBAL_LOAD_USHORT]], implicit $exec + ; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI2]], 1, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI1]], -1, implicit-def dead $scc + ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI1]], [[V_LSHL_OR_B32_e64_3]], 0, implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[PHI2]], [[SI_SPILL_V32_RESTORE]], implicit $exec + ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[V_CMP_GE_U32_e64_]], [[PHI]], implicit-def dead $scc + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_1]], implicit $exec + ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.bb: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = 
V_ADD_U32_e64 -2, [[COPY8]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_MUL_LO_U32_e64_]], [[V_ADD_U32_e64_2]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 100, [[PHI3]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[V_MUL_LO_U32_e64_2]], 0, 0, implicit $exec :: (store (s32) into %ir.p3, addrspace 1) + ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.loop2: + ; CHECK-NEXT: successors: %bb.4(0x04000000), %bb.3(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_2]], %bb.2, %27, %bb.3 + ; CHECK-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_2]], %bb.2, %26, %bb.3 + ; CHECK-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[PHI5]], %bb.2, [[V_MUL_LO_U32_e64_1]], %bb.3 + ; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI7]], 2, implicit-def dead $scc + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[S_ADD_I32_3]], [[SI_SPILL_V32_RESTORE1]], implicit $exec + ; CHECK-NEXT: [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[V_CMP_GE_U32_e64_1]], [[PHI6]], implicit-def dead $scc + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_3]], implicit $exec + ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK1]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.exit: + ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[PHI8]], 0, 0, implicit $exec :: (store (s32) into %ir.p3, addrspace 1) + ; CHECK-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[PHI8]], [[COPY9]], [[PHI4]], implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[V_ADD3_U32_e64_]], [[V_MUL_LO_U32_e64_2]], [[GLOBAL_LOAD_DWORD]], implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[V_LSHL_OR_B32_e64_2]], [[V_ADD3_U32_e64_1]], 100, implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_ADD3_U32_e64_2]], implicit $exec + ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG killed $sgpr0 +entry: +; entry +; | +; +<-----+ +; loop1 | +; +------+ +; | +; bb +; | +; +<-----+ +; loop2 | +; +------+ +; | +; exit + %ld1 = load i32, ptr addrspace(1) %p1, align 1 + %add1 = add i32 %ld1, 100 + br label %loop1 + +loop1: + %phi.inc1 = phi i32 [ 0, %entry ], [ %inc1, %loop1 ] + %phi1 = phi i32 [ %ld1, %entry ], [ %add2, %loop1 ] + %phi2 = phi i32 [ 100, %entry ], [ %mul1, %loop1 ] + %phi3 = phi i32 [ %ld1, %entry ], [ %sub, %loop1 ] + %sext1 = sext i32 %phi.inc1 to i64 + %gep1 = getelementptr inbounds i32, ptr addrspace(1) %p2, i64 %sext1 + %ld2 = load i32, ptr addrspace(1) %gep1, align 4 + %inc1 = add i32 %phi.inc1, 1 + %add2 = add i32 %ld2, %inc1 + %mul1 = mul i32 %ld2, %inc1 + store i32 %mul1, ptr addrspace(1) %p1, align 2 + %mul2 = mul i32 %mul1, %phi.inc1 + %sext2 = sext i32 %inc1 to i64 + %gep2 = getelementptr inbounds i32, ptr addrspace(1) %p3, i64 %sext1 + %ld3 = load i32, ptr addrspace(1) %gep2, align 2 + %sub = sub i32 %ld3, %phi.inc1 + %cond1 = icmp ult i32 %inc1, %TC1 + br i1 %cond1, label 
%loop1, label %bb + +bb: + %mul3 = mul i32 %phi1, 100 + store i32 %mul3, ptr addrspace(1) %p3 + br label %loop2 + +loop2: + %phi.inc2 = phi i32 [ 0, %bb ], [ %inc2, %loop2 ] + %phi4 = phi i32 [ %phi3, %bb ], [ %mul2, %loop2 ] + %inc2 = add i32 %phi.inc2, 2 + store i32 %phi4, ptr addrspace(1) %p3 + %add3 = add i32 %phi4, %inc2 + %cond2 = icmp ult i32 %inc2, %TC2 + br i1 %cond2, label %loop2, label %exit + +exit: + %add4 = add i32 %add3, %phi2 + %add5 = add i32 %add4, %mul3 + %add6 = add i32 %add5, %ld2 + %add7 = add i32 %add6, %add1 + ret i32 %add7 +} + diff --git a/llvm/test/CodeGen/AMDGPU/test_ers_emit_restore_in_loop_preheader3.ll b/llvm/test/CodeGen/AMDGPU/test_ers_emit_restore_in_loop_preheader3.ll new file mode 100644 index 0000000000000..614b9aba84ea0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/test_ers_emit_restore_in_loop_preheader3.ll @@ -0,0 +1,483 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=amdgpu-early-register-spilling -verify-machineinstrs -print-after=amdgpu-early-register-spilling -max-vgprs=25 < %s 2>&1 | FileCheck %s + +; bb.0.entry +; / | +; bb.3.bb3 | +; \ | +; bb.1.Flow12 +; / | +; bb.2.bb2 | +; \ | +; bb.4.bb4 +; | +; bb.5.loop1.header<-------+ +; | | +; bb.6.loop2.header<-----+ | +; | | | +; bb.7.loop3.header<---+ | | +; / | | | | +; bb.8.bb5 | | | | +; \ | | | | +; bb.9.loop3.latch-----+ | | +; | | | +; bb.10.loop2.latch------+ | +; | | +; bb.11.loop4.preheader | +; | | +; bb.12.loop4<----+ | +; +----------+ | +; | | +; | | +; bb.13.loop1.latch--------+ +; | +; bb.14.bb6 +; / | +; bb.15.bb7 | +; \ | +; bb.16.loop5.preheader +; | +; +-->bb.17.loop5 +; +--------+ +; | +; bb.18.exit +define amdgpu_ps i32 @test (ptr addrspace(1) %p1, ptr addrspace(1) %p2, ptr addrspace(1) %p3, ptr addrspace(1) %p4, ptr addrspace(1) %p5, ptr addrspace(1) %p6, ptr addrspace(1) %p7, ptr addrspace(1) %p8, ptr addrspace(1) %p9, ptr addrspace(1) %p10, ptr addrspace(1) %p11, i32 %TC1, i32 %TC2, i32 %TC3, i32 %TC4, i32 %TC5, i32 %Val1, i32 %Val2, i1 %cond1) { + ; CHECK-LABEL: name: test + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr27 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr26 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr25 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr24 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr23 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr22 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr21 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr20 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr19 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr18 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr17 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY $vgpr16 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY $vgpr15 + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY $vgpr14 + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY $vgpr13 + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY $vgpr12 + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; 
CHECK-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; CHECK-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY3]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY7]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY1]], %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: [[COPY27:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY4]], %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY6]], %stack.4, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY27]], %subreg.sub0, [[COPY26]], %subreg.sub1 + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 1, [[V_AND_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.p1, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 1, 0, implicit $exec :: (load (s8) from %ir.p1 + 1, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE1]], 8, [[GLOBAL_LOAD_UBYTE]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE2:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 2, 0, implicit $exec :: (load (s8) from %ir.p1 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE3:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 3, 0, implicit $exec :: (load (s8) from %ir.p1 + 3, addrspace 1) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY5]], %stack.5, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE3]], 8, [[GLOBAL_LOAD_UBYTE2]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_1]], 16, [[V_LSHL_OR_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_NE_U32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.Flow: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE [[SI_IF]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.bb2: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[COPY2]], [[V_LSHL_OR_B32_e64_2]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[V_MUL_LO_U32_e64_]], 0, 0, implicit $exec :: (store (s32) into %ir.p1, addrspace 
1) + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.bb3: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY2]], [[V_LSHL_OR_B32_e64_2]], 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_SHORT_D16_HI [[REG_SEQUENCE]], [[V_ADD_U32_e64_]], 2, 0, implicit $exec :: (store (s16) into %ir.p1 + 2, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE]], [[V_ADD_U32_e64_]], 0, 0, implicit $exec :: (store (s16) into %ir.p1, addrspace 1) + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.bb4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SI_END_CF [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY12]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY15]], %subreg.sub0, [[COPY14]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY17]], %subreg.sub0, [[COPY16]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY19]], %subreg.sub0, [[COPY18]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY21]], %subreg.sub0, [[COPY20]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE8:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY23]], %subreg.sub0, [[COPY22]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE9:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY25]], %subreg.sub0, [[COPY24]], %subreg.sub1 + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE4:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.p1, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE5:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 1, 0, implicit $exec :: (load (s8) from %ir.p1 + 1, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_3:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE5]], 8, [[GLOBAL_LOAD_UBYTE4]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE6:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 2, 0, implicit $exec :: (load (s8) from %ir.p1 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE7:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 3, 0, implicit $exec :: (load (s8) from %ir.p1 + 3, addrspace 1) + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[REG_SEQUENCE5]], %stack.6, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.6, align 4, addrspace 5) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_4:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE7]], 8, [[GLOBAL_LOAD_UBYTE6]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_5:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_4]], 16, [[V_LSHL_OR_B32_e64_3]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[V_LSHL_OR_B32_e64_2]], 0, implicit $exec + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE 
%stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE2:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE3:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE4:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5.loop1.header: + ; CHECK-NEXT: successors: %bb.6(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.4, %67, %bb.13 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_ADD_U32_e64_1]], %bb.4, %66, %bb.13 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[V_LSHL_OR_B32_e64_2]], %bb.4, %65, %bb.13 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.4, %23, %bb.13 + ; CHECK-NEXT: [[V_ASHRREV_I32_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[PHI2]], implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE10:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[PHI2]], %subreg.sub0, [[V_ASHRREV_I32_e64_]], %subreg.sub1 + ; CHECK-NEXT: [[V_LSHLREV_B64_pseudo_e64_:%[0-9]+]]:vreg_64 = nsw V_LSHLREV_B64_pseudo_e64 3, [[REG_SEQUENCE10]], implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[REG_SEQUENCE]].sub0, [[V_LSHLREV_B64_pseudo_e64_]].sub0, 0, implicit $exec + ; CHECK-NEXT: %250:vgpr_32, dead $sgpr_null = V_ADDC_U32_e64 [[REG_SEQUENCE]].sub1, [[V_LSHLREV_B64_pseudo_e64_]].sub1, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE11:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %250, %subreg.sub1 + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE8:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE11]], 0, 0, implicit $exec :: (load (s8) from %ir.gep1, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE9:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE11]], 1, 0, implicit $exec :: (load (s8) from %ir.gep1 + 1, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_6:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE9]], 8, [[GLOBAL_LOAD_UBYTE8]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE10:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE11]], 2, 0, implicit $exec :: (load (s8) from %ir.gep1 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE11:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE11]], 3, 0, implicit $exec :: (load (s8) from %ir.gep1 + 3, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_7:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE11]], 8, [[GLOBAL_LOAD_UBYTE10]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_8:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_7]], 16, [[V_LSHL_OR_B32_e64_6]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[COPY2]], [[PHI2]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE]], [[V_MUL_LO_U32_e64_1]], 0, 0, implicit $exec :: (store (s16) into %ir.p1, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_SHORT_D16_HI [[REG_SEQUENCE]], [[V_MUL_LO_U32_e64_1]], 2, 0, implicit $exec :: (store (s16) into %ir.p1 + 2, addrspace 1) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_LSHL_OR_B32_e64_8]], [[PHI1]], implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6.loop2.header: + ; 
CHECK-NEXT: successors: %bb.7(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.5, %40, %bb.10 + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[V_MUL_LO_U32_e64_2]], %bb.5, %39, %bb.10 + ; CHECK-NEXT: [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[V_LSHL_OR_B32_e64_8]], %bb.5, %36, %bb.10 + ; CHECK-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI [[PHI2]], %bb.5, %25, %bb.10 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7.loop3.header: + ; CHECK-NEXT: successors: %bb.8(0x40000000), %bb.9(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI8:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.6, %29, %bb.9 + ; CHECK-NEXT: [[PHI9:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.6, %28, %bb.9 + ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI9]], [[PHI6]], 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE12:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE9]], 0, 0, implicit $exec :: (load (s8) from %ir.p2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE13:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE9]], 1, 0, implicit $exec :: (load (s8) from %ir.p2 + 1, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_9:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE13]], 8, [[GLOBAL_LOAD_UBYTE12]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE14:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE9]], 2, 0, implicit $exec :: (load (s8) from %ir.p2 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE15:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE9]], 3, 0, implicit $exec :: (load (s8) from %ir.p2 + 3, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_10:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE15]], 8, [[GLOBAL_LOAD_UBYTE14]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_11:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_10]], 16, [[V_LSHL_OR_B32_e64_9]], implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[V_ADD_U32_e64_2]], [[V_LSHL_OR_B32_e64_11]], implicit $exec + ; CHECK-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_GE_U32_e64_]], %bb.9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8.bb5: + ; CHECK-NEXT: successors: %bb.9(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI9]], [[PHI5]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD_U32_e64_3]], [[V_LSHL_OR_B32_e64_11]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MUL_LO_U32_e64_3]], [[V_MUL_LO_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE4]], [[V_ADD_U32_e64_4]], 0, 0, implicit $exec :: (store (s32) into %ir.p7, addrspace 1) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.9.loop3.latch: + ; CHECK-NEXT: successors: %bb.10(0x04000000), %bb.7(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI10:%[0-9]+]]:vgpr_32 = PHI [[V_LSHL_OR_B32_e64_8]], %bb.7, [[V_ADD_U32_e64_4]], %bb.8 + ; CHECK-NEXT: [[PHI11:%[0-9]+]]:vgpr_32 = PHI [[V_ADD_U32_e64_2]], %bb.7, [[V_MUL_LO_U32_e64_3]], %bb.8 + ; CHECK-NEXT: [[PHI12:%[0-9]+]]:vgpr_32 = PHI [[V_ADD_U32_e64_2]], %bb.7, [[PHI6]], %bb.8 + ; CHECK-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE3]], [[PHI12]], 0, 0, implicit $exec :: (store (s32) 
into %ir.p9, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (load (s32) from %ir.p10, addrspace 1) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_4:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[PHI11]], [[PHI10]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[V_MUL_LO_U32_e64_4]], 0, 0, implicit $exec :: (store (s32) into %ir.p10, addrspace 1) + ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI9]], 1, implicit-def dead $scc + ; CHECK-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[S_ADD_I32_]], [[PHI6]], 0, implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[V_ADD_U32_e64_5]], [[SI_SPILL_V32_RESTORE4]], implicit $exec + ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[V_CMP_GE_U32_e64_1]], [[PHI8]], implicit-def dead $scc + ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.10.loop2.latch: + ; CHECK-NEXT: successors: %bb.11(0x04000000), %bb.6(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[PHI6]], 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s32) from %ir.p11, addrspace 1) + ; CHECK-NEXT: [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_6]], [[PHI11]], 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE1]], [[V_ADD_U32_e64_7]], 0, 0, implicit $exec :: (store (s32) into %ir.p11, addrspace 1) + ; CHECK-NEXT: [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI5]], [[PHI1]], 0, implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_2:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[V_ADD_U32_e64_6]], [[SI_SPILL_V32_RESTORE3]], implicit $exec + ; CHECK-NEXT: [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[V_CMP_GE_U32_e64_2]], [[PHI4]], implicit-def dead $scc + ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK1]], %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.11.loop4.preheader: + ; CHECK-NEXT: successors: %bb.12(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE12:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[PHI10]], %subreg.sub0, undef %237:vgpr_32, %subreg.sub1 + ; CHECK-NEXT: %188:vreg_64, $sgpr_null = V_MAD_U64_U32_e64 [[PHI10]], [[PHI11]], [[REG_SEQUENCE12]], 0, implicit $exec + ; CHECK-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY %188.sub0 + ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.12.loop4: + ; CHECK-NEXT: successors: %bb.13(0x04000000), %bb.12(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI13:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_2]], %bb.11, %61, %bb.12 + ; CHECK-NEXT: [[PHI14:%[0-9]+]]:vgpr_32 = PHI [[COPY28]], %bb.11, %60, %bb.12 + ; CHECK-NEXT: [[PHI15:%[0-9]+]]:vgpr_32 = PHI [[V_MUL_LO_U32_e64_4]], %bb.11, %57, %bb.12 + ; CHECK-NEXT: [[PHI16:%[0-9]+]]:vgpr_32 = PHI [[V_ADD_U32_e64_7]], %bb.11, [[PHI3]], %bb.12 + ; CHECK-NEXT: [[PHI17:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_LOAD_DWORD]], %bb.11, %58, %bb.12 + ; CHECK-NEXT: [[PHI18:%[0-9]+]]:vgpr_32 = 
PHI [[GLOBAL_LOAD_DWORD1]], %bb.11, %59, %bb.12 + ; CHECK-NEXT: [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI16]], [[PHI10]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_5:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[PHI18]], [[GLOBAL_LOAD_DWORD]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE8]], [[V_MUL_LO_U32_e64_5]], 0, 0, implicit $exec :: (store (s32) into %ir.p3, addrspace 1) + ; CHECK-NEXT: [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI16]], [[PHI14]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_6:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[PHI7]], [[V_ADD_U32_e64_10]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_7:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[PHI17]], [[GLOBAL_LOAD_DWORD1]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MUL_LO_U32_e64_7]], [[V_MUL_LO_U32_e64_6]], 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE7]], [[V_ADD_U32_e64_11]], 0, 0, implicit $exec :: (store (s32) into %ir.p4, addrspace 1) + ; CHECK-NEXT: [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 4, [[PHI15]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 -1431655765, [[V_MUL_LO_U32_e64_6]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 1, [[V_MUL_HI_U32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 31, [[V_ADD_U32_e64_9]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_9]], [[V_LSHRREV_B32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[V_ASHRREV_I32_e64_1:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 1, [[V_ADD_U32_e64_13]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 4, [[PHI14]], 0, implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_3:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[V_ADD_U32_e64_12]], [[SI_SPILL_V32_RESTORE2]], implicit $exec + ; CHECK-NEXT: [[SI_IF_BREAK2:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[V_CMP_GE_U32_e64_3]], [[PHI13]], implicit-def dead $scc + ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK2]], %bb.12, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.13.loop1.latch: + ; CHECK-NEXT: successors: %bb.14(0x04000000), %bb.5(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MUL_LO_U32_e64_7]], [[SI_SPILL_V32_RESTORE1]], 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE6]], [[V_ADD_U32_e64_15]], 0, 0, implicit $exec :: (store (s32) into %ir.p5, addrspace 1) + ; CHECK-NEXT: [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[PHI2]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[PHI1]], 0, implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_4:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[V_ADD_U32_e64_16]], [[SI_SPILL_V32_RESTORE]], implicit $exec + ; CHECK-NEXT: [[SI_IF_BREAK3:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[V_CMP_GE_U32_e64_4]], [[PHI]], implicit-def dead $scc + ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK3]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.14 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.14.bb6: + ; CHECK-NEXT: successors: %bb.15(0x40000000), %bb.16(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: 
SI_END_CF [[SI_IF_BREAK3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_8:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 100, [[V_LSHL_OR_B32_e64_5]], implicit $exec + ; CHECK-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_U32_e64 [[V_MUL_LO_U32_e64_8]], [[V_ADD_U32_e64_15]], implicit $exec + ; CHECK-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_GT_U32_e64_]], %bb.16, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.15 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.15.bb7: + ; CHECK-NEXT: successors: %bb.16(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.6, align 4, addrspace 5) + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[SI_SPILL_V64_RESTORE]], [[V_MUL_LO_U32_e64_8]], 0, 0, implicit $exec :: (store (s32) into %ir.p6, addrspace 1) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.16.loop5.preheader: + ; CHECK-NEXT: successors: %bb.17(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI19:%[0-9]+]]:vgpr_32 = PHI [[V_MUL_LO_U32_e64_8]], %bb.14, [[V_ADD_U32_e64_15]], %bb.15 + ; CHECK-NEXT: SI_END_CF [[SI_IF2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE5:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.17.loop5: + ; CHECK-NEXT: successors: %bb.18(0x04000000), %bb.17(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI20:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_3]], %bb.16, %76, %bb.17 + ; CHECK-NEXT: [[PHI21:%[0-9]+]]:vgpr_32 = PHI [[PHI19]], %bb.16, %75, %bb.17 + ; CHECK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 2, [[PHI21]], 0, implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_5:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[V_ADD_U32_e64_18]], [[SI_SPILL_V32_RESTORE5]], implicit $exec + ; CHECK-NEXT: [[SI_IF_BREAK4:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[V_CMP_GE_U32_e64_5]], [[PHI20]], implicit-def dead $scc + ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK4]], %bb.17, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.18 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.18.exit: + ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK4]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 100, [[V_LSHL_OR_B32_e64_2]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_9:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_MUL_LO_U32_e64_8]], [[PHI21]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_10:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD_U32_e64_19]], [[V_MUL_LO_U32_e64_9]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MUL_LO_U32_e64_10]], implicit $exec + ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG killed $sgpr0 +entry: +; entry +; | +; bb1 +; / \ +; bb2 bb3 +; \ / +; bb4 +; | +; loop1.header<-------+ +; | | +; loop2.header<-----+ | +; | | | +; loop3.header<---+ | | +; / | | | | +; bb5 | | | | +; \ | | | | +; loop3.latch-----+ | | +; | | | +; loop2.latch-------+ | +; | | +; +-->loop4| | +; +--------+ | +; | | +; loop1.latch---------+ +; | +; bb6 +; / | +; bb17 | +; | | +; +-->loop5 +; +-----+ +; | +; exit + + %ld1 = load i32, ptr addrspace(1) %p1, align 1 
+ %add1 = add i32 %ld1, 100 + br label %bb1 + +bb1: + br i1 %cond1, label %bb2, label %bb3 + +bb2: + %mul1 = mul i32 %Val1, %ld1 + store i32 %mul1, ptr addrspace(1) %p1, align 4 + br label %bb4 + +bb3: + %add2 = add i32 %Val1, %ld1 + store i32 %add2, ptr addrspace(1) %p1, align 2 + br label %bb4 + +bb4: + %phi1 = phi i32 [ %mul1, %bb2 ], [ %add2, %bb3 ] + %ld2 = load i32, ptr addrspace(1) %p1, align 1 + br label %loop1.header + +loop1.header: + %phi.inc1 = phi i32 [ %ld1, %bb4 ], [ %inc1, %loop1.latch ] + %phi.phi = phi i32 [ 0, %bb4 ], [ %phi2, %loop1.latch ] + %sext1 = sext i32 %phi.inc1 to i64 + %gep1 = getelementptr inbounds i64, ptr addrspace(1) %p1, i64 %sext1 + %ld3 = load i32, ptr addrspace(1) %gep1, align 1 + %mul2 = mul i32 %Val1, %phi.inc1 + store i32 %mul2, ptr addrspace(1) %p1, align 2 + br label %loop2.header + +loop2.header: + %phi.inc2 = phi i32 [ %ld3, %loop1.header ], [ %inc2, %loop2.latch ] + %phi6 = phi i32 [ %phi.inc1, %loop1.header ], [ %phi5, %loop2.latch ] + br label %loop3.header + +loop3.header: + %phi.inc3 = phi i32 [ %phi.inc2, %loop2.header ], [ %inc3, %loop3.latch ] + %ld4 = load i32, ptr addrspace(1) %p2, align 1 + %cond2 = icmp uge i32 %phi.inc3, %ld4 + br i1 %cond2, label %bb5, label %loop3.latch + +bb5: + %mul3 = mul i32 %phi.inc1, %phi.inc2 + %add3 = add i32 %mul3, %phi.inc3 + %mul4 = mul i32 %add3, %ld4 + %add4 = add i32 %mul4, %mul2 + store i32 %add4, ptr addrspace(1) %p7 + br label %loop3.latch + +loop3.latch: + %phi2 = phi i32 [ %add4, %bb5 ], [ %ld3, %loop3.header ] + %phi4 = phi i32 [ %mul4, %bb5 ], [ %phi.inc3, %loop3.header ] + %phi5 = phi i32 [ %phi.inc2, %bb5 ], [ %phi.inc3, %loop3.header ] + store i32 %phi5, ptr addrspace(1) %p9 + %inc3 = add i32 %phi.inc3, 1 + %ld10 = load i32, ptr addrspace(1) %p10 + %mul11 = mul i32 %phi4, %phi2 + store i32 %mul11, ptr addrspace(1) %p10 + %cond3 = icmp ult i32 %inc3, %TC3 + br i1 %cond3, label %loop3.header, label %loop2.latch + +loop2.latch: + %inc2 = add i32 %phi.inc2, 1 + %ld11 = load i32, ptr addrspace(1) %p11 + %add9 = add i32 %inc2, %phi4 + store i32 %add9, ptr addrspace(1) %p11 + %cond4 = icmp ult i32 %inc2, %TC2 + br i1 %cond4, label %loop2.header, label %loop4 + +loop4: + %phi.inc4 = phi i32 [ %mul11, %loop2.latch ], [ %inc4, %loop4 ] + %phi7 = phi i32 [ %add9, %loop2.latch ], [ %phi.phi, %loop4 ] + %phi.div1 = phi i32 [ %ld10, %loop2.latch ], [ %div1, %loop4 ] + %phi.div2 = phi i32 [ %ld11, %loop2.latch ], [ %div2, %loop4 ] + %add5 = add i32 %phi7, %phi2 + %mul5 = mul i32 %phi.div2, %ld10 + store i32 %mul5, ptr addrspace(1) %p3 + %add6 = add i32 %add5, %phi.inc4 + %mul8 = mul i32 %phi6, %add6 + %mul9 = mul i32 %phi.div1, %ld11 + %add10 = add i32 %mul9, %mul8 + store i32 %add10, ptr addrspace(1) %p4 + %inc4 = add i32 %phi.inc4, 4 + %div1 = udiv i32 %mul8, 3 + %div2 = sdiv i32 %add5, 2 + %cond7 = icmp ult i32 %inc4, %TC4 + br i1 %cond7, label %loop4, label %loop1.latch + +loop1.latch: + %add7 = add i32 %mul9, %Val2 + store i32 %add7, ptr addrspace(1) %p5 + %inc1 = add i32 %phi.inc1, 1 + %cond5 = icmp ult i32 %inc1, %TC1 + br i1 %cond5, label %loop1.header, label %bb6 + +bb6: + %mul6 = mul i32 %ld2, 100 + %cond8 = icmp ugt i32 %mul6, %add7 + br i1 %cond8, label %bb7, label %loop5 + +bb7: + store i32 %mul6, ptr addrspace(1) %p6 + br label %loop5 + +loop5: + %phi.inc5 = phi i32 [ %add7, %bb7 ], [ %mul6, %bb6 ], [ %inc5, %loop5 ] + %add8 = mul i32 %mul6, %phi.inc5 + %inc5 = add i32 %phi.inc5, 2 + %cond9 = icmp ult i32 %inc5, %TC5 + br i1 %cond9, label %loop5, label %exit + +exit: + %mul7 = mul i32 %add1, 
%add8 + ret i32 %mul7 +} diff --git a/llvm/test/CodeGen/AMDGPU/test_ers_emit_restore_in_loop_preheader4.ll b/llvm/test/CodeGen/AMDGPU/test_ers_emit_restore_in_loop_preheader4.ll new file mode 100644 index 0000000000000..8c2a712eef5c6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/test_ers_emit_restore_in_loop_preheader4.ll @@ -0,0 +1,195 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=amdgpu-early-register-spilling -verify-machineinstrs -print-after=amdgpu-early-register-spilling -max-vgprs=10 < %s 2>&1 | FileCheck %s + +; +; bb.0.entry +; | +; +<--------+ +; bb.1.loop1 | +; +---------+ +; | +; bb.2.bb +; | +; +<--------+ +; bb.3.loop2 | +; +---------+ +; | +; bb.4.exit +; +define amdgpu_ps i32 @test(ptr addrspace(1) %p1, ptr addrspace(1) %p2, ptr addrspace(1) %p3, ptr addrspace(1) %p4, i32 %TC1, i32 %TC2) { + ; CHECK-LABEL: name: test + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY1]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (load (s16) from %ir.p4, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[REG_SEQUENCE3]], 2, 0, implicit $exec :: (load (s16) from %ir.p4 + 2, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_USHORT1]], 16, [[GLOBAL_LOAD_USHORT]], implicit $exec + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_LSHL_OR_B32_e64_]], %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (load (s8) from %ir.p1, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 1, 0, implicit $exec :: (load (s8) from %ir.p1 + 1, addrspace 1) + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[REG_SEQUENCE]], %stack.3, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 
[[GLOBAL_LOAD_UBYTE1]], 8, [[GLOBAL_LOAD_UBYTE]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE2:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 2, 0, implicit $exec :: (load (s8) from %ir.p1 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE3:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 3, 0, implicit $exec :: (load (s8) from %ir.p1 + 3, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE3]], 8, [[GLOBAL_LOAD_UBYTE2]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_3:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_2]], 16, [[V_LSHL_OR_B32_e64_1]], implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 100, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 100, [[V_LSHL_OR_B32_e64_3]], 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE3]], [[V_ADD_U32_e64_]], 0, 0, implicit $exec :: (store (s32) into %ir.p4, addrspace 1) + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.loop1: + ; CHECK-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.0, %15, %bb.1 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.0, %13, %bb.1 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %12, %bb.1 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[V_LSHL_OR_B32_e64_3]], %bb.0, %10, %bb.1 + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.0, %11, %bb.1 + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[V_LSHL_OR_B32_e64_3]], %bb.0, %14, %bb.1 + ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI2]], -1, implicit-def dead $scc + ; CHECK-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_ADD_I32_]], 31, implicit-def dead $scc + ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_I32_]], %subreg.sub0, [[S_ASHR_I32_]], %subreg.sub1 + ; CHECK-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = nsw S_LSHL_B64 [[REG_SEQUENCE4]], 2, implicit-def dead $scc + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[REG_SEQUENCE1]].sub0, [[S_LSHL_B64_]].sub0, 0, implicit $exec + ; CHECK-NEXT: %110:vgpr_32, dead $sgpr_null = V_ADDC_U32_e64 [[S_LSHL_B64_]].sub1, [[REG_SEQUENCE1]].sub1, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %110, %subreg.sub1 + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE5]], 0, 0, implicit $exec :: (load (s32) from %ir.gep1, addrspace 1) + ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI2]], [[GLOBAL_LOAD_DWORD]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD]], [[PHI2]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_SHORT_D16_HI [[REG_SEQUENCE2]], [[V_MUL_LO_U32_e64_]], 2, 0, implicit $exec :: (store (s16) into %ir.p1 + 2, addrspace 1) 
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE2]], [[V_MUL_LO_U32_e64_]], 0, 0, implicit $exec :: (store (s16) into %ir.p1, addrspace 1) + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[SI_SPILL_V64_RESTORE]].sub0, [[S_LSHL_B64_]].sub0, 0, implicit $exec + ; CHECK-NEXT: %118:vgpr_32, dead $sgpr_null = V_ADDC_U32_e64 [[S_LSHL_B64_]].sub1, [[SI_SPILL_V64_RESTORE]].sub1, [[V_ADD_CO_U32_e64_3]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, %118, %subreg.sub1 + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT2:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[REG_SEQUENCE6]], 0, 0, implicit $exec :: (load (s16) from %ir.gep2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT3:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[REG_SEQUENCE6]], 2, 0, implicit $exec :: (load (s16) from %ir.gep2 + 2, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_4:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_USHORT3]], 16, [[GLOBAL_LOAD_USHORT2]], implicit $exec + ; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI2]], 1, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI1]], -1, implicit-def dead $scc + ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI1]], [[V_LSHL_OR_B32_e64_4]], 0, implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[PHI2]], [[SI_SPILL_V32_RESTORE]], implicit $exec + ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[V_CMP_GE_U32_e64_]], [[PHI]], implicit-def dead $scc + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_1]], implicit $exec + ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.bb: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[COPY10]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_MUL_LO_U32_e64_]], [[V_ADD_U32_e64_3]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 100, [[PHI3]], implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_MUL_LO_U32_e64_2]], [[SI_SPILL_V32_RESTORE1]], implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE1:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[SI_SPILL_V64_RESTORE1]], [[V_MUL_LO_U32_e64_3]], 0, 0, implicit $exec :: (store (s32) into %ir.p3, addrspace 1) + ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE2:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE3:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.loop2: + ; CHECK-NEXT: successors: %bb.4(0x04000000), %bb.3(0x7c000000) + ; 
CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_2]], %bb.2, %29, %bb.3 + ; CHECK-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI [[SI_SPILL_V32_RESTORE3]], %bb.2, %28, %bb.3 + ; CHECK-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[PHI5]], %bb.2, [[V_MUL_LO_U32_e64_1]], %bb.3 + ; CHECK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 2, [[PHI7]], 0, implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[V_ADD_U32_e64_4]], [[SI_SPILL_V32_RESTORE2]], implicit $exec + ; CHECK-NEXT: [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[V_CMP_GE_U32_e64_1]], [[PHI6]], implicit-def dead $scc + ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK1]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.exit: + ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[SI_SPILL_V64_RESTORE1]], [[PHI8]], 0, 0, implicit $exec :: (store (s32) into %ir.p3, addrspace 1) + ; CHECK-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI8]], [[V_ADD_U32_e64_4]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[V_ADD_U32_e64_5]], [[PHI4]], [[V_MUL_LO_U32_e64_2]], implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[V_ADD3_U32_e64_]], [[GLOBAL_LOAD_DWORD]], [[V_ADD_U32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_ADD3_U32_e64_1]], implicit $exec + ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG killed $sgpr0 +entry: +; entry +; | +; +<-----+ +; loop1 | +; +------+ +; | +; bb +; | +; +<-----+ +; loop2 | +; +------+ +; | +; exit + %ld0 = load i32, ptr addrspace(1) %p4, align 2 + %ld1 = load i32, ptr addrspace(1) %p1, align 1 + %add1 = add i32 %ld1, 100 + store i32 %add1, ptr addrspace(1) %p4, align 4 + br label %loop1 + +loop1: + %phi.inc1 = phi i32 [ 0, %entry ], [ %inc1, %loop1 ] + %phi1 = phi i32 [ %ld1, %entry ], [ %add2, %loop1 ] + %phi2 = phi i32 [ 100, %entry ], [ %mul1, %loop1 ] + %phi3 = phi i32 [ %ld1, %entry ], [ %sub, %loop1 ] + %sext1 = sext i32 %phi.inc1 to i64 + %gep1 = getelementptr inbounds i32, ptr addrspace(1) %p2, i64 %sext1 + %ld2 = load i32, ptr addrspace(1) %gep1, align 4 + %inc1 = add i32 %phi.inc1, 1 + %add2 = add i32 %ld2, %inc1 + %mul1 = mul i32 %ld2, %inc1 + store i32 %mul1, ptr addrspace(1) %p1, align 2 + %mul2 = mul i32 %mul1, %phi.inc1 + %sext2 = sext i32 %inc1 to i64 + %gep2 = getelementptr inbounds i32, ptr addrspace(1) %p3, i64 %sext1 + %ld3 = load i32, ptr addrspace(1) %gep2, align 2 + %sub = sub i32 %ld3, %phi.inc1 + %cond1 = icmp ult i32 %inc1, %TC1 + br i1 %cond1, label %loop1, label %bb + +bb: + %mul3 = mul i32 %phi1, 100 + %mul4 = mul i32 %mul3, %ld0 + store i32 %mul4, ptr addrspace(1) %p3 + br label %loop2 + +loop2: + %phi.inc2 = phi i32 [ %ld0, %bb ], [ %inc2, %loop2 ] + %phi4 = phi i32 [ %phi3, %bb ], [ %mul2, %loop2 ] + %inc2 = add i32 %phi.inc2, 2 + store i32 %phi4, ptr addrspace(1) %p3 + %add3 = add i32 %phi4, %inc2 + %cond2 = icmp ult i32 %inc2, %TC2 + br i1 %cond2, label %loop2, label %exit + +exit: + %add4 = add i32 %add3, %phi2 + %add5 = add i32 %add4, %mul3 + %add6 = add i32 %add5, %ld2 + %add7 = add i32 %add6, %add1 + ret i32 %add7 +} + diff --git a/llvm/test/CodeGen/AMDGPU/test_ers_keep_spilled_reg_live.ll b/llvm/test/CodeGen/AMDGPU/test_ers_keep_spilled_reg_live.ll 
new file mode 100644 index 0000000000000..2dfae88b80b49 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/test_ers_keep_spilled_reg_live.ll @@ -0,0 +1,202 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=amdgpu-early-register-spilling -verify-machineinstrs -print-after=amdgpu-early-register-spilling -max-vgprs=10 < %s 2>&1 | FileCheck %s +; +; bb.0.entry +; / | +; bb.3.bb2 | +; / | | +; bb.9.bb5 | | +; \ | | +; bb.1.Flow1 | +; \ | +; bb.8.Flow +; / | +; bb.2.bb1 | +; \ | +; bb.6.Flow2 +; / | +; bb.7.bb4 | +; \ | +; bb.4.Flow3 +; / | +; bb.5.bb3 | +; \ | +; bb.10.exit +; +define amdgpu_ps i64 @test(i1 %cond, ptr addrspace(3) %p, i64 %val) { + ; CHECK-LABEL: name: test + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.8(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 1, [[V_AND_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY2]], %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_NE_U32_e64_]], %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.Flow1: + ; CHECK-NEXT: successors: %bb.8(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI %35, %bb.3, %80, %bb.9 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vreg_64 = PHI undef %33:vreg_64, %bb.3, %20, %bb.9 + ; CHECK-NEXT: SI_END_CF %6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[PHI]], $exec_lo, implicit-def dead $scc + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_AND_B32_]] + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: S_BRANCH %bb.8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.bb1: + ; CHECK-NEXT: successors: %bb.6(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 %121, 0, 0, implicit $exec :: (load (s16) from %ir.p, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 %121, 2, 0, implicit $exec :: (load (s16) from %ir.p + 2, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U16_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 %121, 4, 0, implicit $exec :: (load (s16) from %ir.p + 4, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U16_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 %121, 6, 0, implicit $exec :: (load (s16) from %ir.p + 6, addrspace 3) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U16_gfx9_3]], 16, [[DS_READ_U16_gfx9_2]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U16_gfx9_1]], 16, 
[[DS_READ_U16_gfx9_]], implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHL_OR_B32_e64_1]], %subreg.sub0, [[V_LSHL_OR_B32_e64_]], %subreg.sub1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $exec_lo + ; CHECK-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 %18, $exec_lo, implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], $exec_lo, implicit-def dead $scc + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_]], [[S_AND_B32_1]], implicit-def dead $scc + ; CHECK-NEXT: S_BRANCH %bb.6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.bb2: + ; CHECK-NEXT: successors: %bb.9(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY2]], 8, 0, implicit $exec :: (load (s64) from %ir.gep2, addrspace 3) + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[DS_READ_B64_gfx9_]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_NE_U32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.Flow3: + ; CHECK-NEXT: successors: %bb.5(0x40000000), %bb.10(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI %12, %bb.6, %82, %bb.7 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI %16, %bb.6, %15, %bb.7 + ; CHECK-NEXT: SI_END_CF %14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32 = SI_IF [[PHI2]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5.bb3: + ; CHECK-NEXT: successors: %bb.10(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 %121, 6, 7, 0, implicit $exec :: (load (s64) from %ir.gep3, align 4, addrspace 3) + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[DS_READ2_B32_gfx9_]].sub0, [[COPY1]], 0, implicit $exec + ; CHECK-NEXT: %95:vgpr_32, dead $sgpr_null = V_ADDC_U32_e64 [[DS_READ2_B32_gfx9_]].sub1, %119, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %95, %subreg.sub1 + ; CHECK-NEXT: S_BRANCH %bb.10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6.Flow2: + ; CHECK-NEXT: successors: %bb.7(0x40000000), %bb.4(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI %18, %bb.8, [[S_OR_B32_]], %bb.2 + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.8, [[COPY6]], %bb.2 + ; CHECK-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI %17, %bb.8, [[COPY5]], %bb.2 + ; CHECK-NEXT: SI_END_CF %19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: [[SI_IF3:%[0-9]+]]:sreg_32 = SI_IF [[PHI4]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7.bb4: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; 
CHECK-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[PHI6]].sub0, [[COPY1]], 0, implicit $exec + ; CHECK-NEXT: %103:vgpr_32, dead $sgpr_null = V_ADDC_U32_e64 [[PHI6]].sub1, [[SI_SPILL_V32_RESTORE]], [[V_ADD_CO_U32_e64_3]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, %103, %subreg.sub1 + ; CHECK-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[PHI5]], $exec_lo, implicit-def dead $scc + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_ANDN2_B32_1]] + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8.Flow: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, [[COPY4]], %bb.1 + ; CHECK-NEXT: [[PHI8:%[0-9]+]]:vreg_64 = PHI undef %27:vreg_64, %bb.0, [[PHI1]], %bb.1 + ; CHECK-NEXT: [[PHI9:%[0-9]+]]:vreg_64 = PHI undef %27:vreg_64, %bb.0, [[SI_SPILL_V64_RESTORE]], %bb.1 + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE [[SI_IF]], %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.9.bb5: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY2]], 32, 0, implicit $exec :: (load (s8) from %ir.gep4, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY2]], 33, 0, implicit $exec :: (load (s8) from %ir.gep4 + 1, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY2]], 34, 0, implicit $exec :: (load (s8) from %ir.gep4 + 2, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY2]], 35, 0, implicit $exec :: (load (s8) from %ir.gep4 + 3, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_4:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY2]], 36, 0, implicit $exec :: (load (s8) from %ir.gep4 + 4, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_5:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY2]], 37, 0, implicit $exec :: (load (s8) from %ir.gep4 + 5, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_6:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY2]], 38, 0, implicit $exec :: (load (s8) from %ir.gep4 + 6, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_7:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY2]], 39, 0, implicit $exec :: (load (s8) from %ir.gep4 + 7, addrspace 3) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U8_gfx9_5]], 8, [[DS_READ_U8_gfx9_4]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_3:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U8_gfx9_7]], 8, [[DS_READ_U8_gfx9_6]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_4:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_3]], 16, [[V_LSHL_OR_B32_e64_2]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_5:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U8_gfx9_1]], 8, [[DS_READ_U8_gfx9_]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_6:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U8_gfx9_3]], 8, [[DS_READ_U8_gfx9_2]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_7:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_6]], 16, [[V_LSHL_OR_B32_e64_5]], implicit $exec 
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e64_4:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_5:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[V_LSHL_OR_B32_e64_7]], [[COPY1]], 0, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE2:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: %111:vgpr_32, dead $sgpr_null = V_ADDC_U32_e64 [[V_LSHL_OR_B32_e64_4]], [[SI_SPILL_V32_RESTORE2]], [[V_ADD_CO_U32_e64_5]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_4]], %subreg.sub0, %111, %subreg.sub1 + ; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, -1, implicit-def dead $scc + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.10.exit: + ; CHECK-NEXT: [[PHI10:%[0-9]+]]:vreg_64 = PHI [[PHI3]], %bb.4, [[REG_SEQUENCE1]], %bb.5 + ; CHECK-NEXT: SI_END_CF [[SI_IF2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI10]].sub0, implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI10]].sub1, implicit $exec + ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG killed $sgpr0, killed $sgpr1 +entry: +; entry +; / \ +; bb1 bb2 +; / \ / \ +; bb3 bb4 bb5 +; \ | / +; exit + br i1 %cond, label %bb1, label %bb2 + +bb1: + %gep1 = getelementptr inbounds i64, ptr addrspace(3) %p, i64 0 + %ld1 = load i64, ptr addrspace(3) %gep1, align 2 + br i1 %cond, label %bb3, label %bb4 + +bb2: + %gep2 = getelementptr inbounds i64, ptr addrspace(3) %p, i64 1 + %ld2 = load i64, ptr addrspace(3) %gep2, align 8 + br i1 %cond, label %bb4, label %bb5 + +bb3: + %gep3 = getelementptr inbounds i64, ptr addrspace(3) %p, i64 3 + %ld3 = load i64, ptr addrspace(3) %gep3, align 4 + %add1 = add i64 %ld3, %val + br label %exit + +bb4: + %phi1 = phi i64 [ %ld1, %bb1 ], [ %ld2, %bb2] + %add2 = add i64 %phi1, %val + br label %exit + +bb5: + %gep4 = getelementptr inbounds i64, ptr addrspace(3) %p, i64 4 + %ld4 = load i64, ptr addrspace(3) %gep4, align 1 + %add3 = add i64 %ld4, %val + br label %exit + +exit: + %phi2 = phi i64 [ %add1, %bb3 ], [ %add2, %bb4 ], [ %add3, %bb5 ] + ret i64 %phi2 +} diff --git a/llvm/test/CodeGen/AMDGPU/test_ers_multiple_spills1.ll b/llvm/test/CodeGen/AMDGPU/test_ers_multiple_spills1.ll new file mode 100644 index 0000000000000..9fca38d152c4a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/test_ers_multiple_spills1.ll @@ -0,0 +1,722 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=amdgpu-early-register-spilling -verify-machineinstrs -print-after=amdgpu-early-register-spilling -max-vgprs=50 < %s 2>&1 | FileCheck %s + +@array2 = global [5 x i32] zeroinitializer, align 4 +@array3 = global [5 x i32] zeroinitializer, align 4 +@array4 = global [5 x i32] zeroinitializer, align 4 +@array5 = global [5 x i32] zeroinitializer, align 4 + +@array6 = global [5 x i32] zeroinitializer, align 4 +@array7 = global [5 x i32] zeroinitializer, align 4 +@array8 = global [5 x i32] zeroinitializer, align 4 +@array9 = global [5 x i32] zeroinitializer, align 4 + +; bb.0.entry +; / | +; bb.3.bb2 | +; \ | +; bb.1.Flow3 +; / | +; bb.2.bb1 | +; \ | +; bb.4.bb3 +; / | +; bb.7.bb5 | +; \ | +; bb.5.Flow2 +; / | +; bb.6.bb4 | +; \ | +; bb.8.bb6 +; / | +; 
bb.11.bb8 | +; \ | +; bb.9.Flow +; / | +; bb.10.bb7 | +; \ | +; bb.12.Flow1 +; / | +; bb.13.bb9 | +; \ | +; bb.14.bb10 +; +define amdgpu_ps void @test(ptr addrspace(1) %p1, ptr addrspace(3) %p2, ptr addrspace(1) %p3, ptr addrspace(1) %p4, ptr addrspace(1) %p5, ptr addrspace(1) %p6, ptr addrspace(1) %p7, ptr addrspace(1) %p8, ptr addrspace(1) %p9, ptr addrspace(1) %p10) { + ; CHECK-LABEL: name: test + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr18 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr17 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr16 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr15 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr14 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr13 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr12 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY17]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY12]], %subreg.sub1 + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (load (s16) from %ir.p4, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[REG_SEQUENCE3]], 2, 0, implicit $exec :: (load (s16) from %ir.p4 + 2, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_USHORT1]], 16, [[GLOBAL_LOAD_USHORT]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (load (s32) from %ir.p5, align 8, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s32) from %ir.p1, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 12, 0, implicit $exec :: (load (s8) from %ir.gep1, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 13, 0, implicit $exec :: (load (s8) from %ir.gep1 + 1, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE1]], 8, [[GLOBAL_LOAD_UBYTE]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE2:%[0-9]+]]:vgpr_32 = 
GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 14, 0, implicit $exec :: (load (s8) from %ir.gep1 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE3:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 15, 0, implicit $exec :: (load (s8) from %ir.gep1 + 3, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE3]], 8, [[GLOBAL_LOAD_UBYTE2]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHL_OR_B32_e64_]], [[GLOBAL_LOAD_DWORD]], 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE1]], [[V_ADD_U32_e64_]], 0, 0, implicit $exec :: (store (s32) into %ir.p1, addrspace 1) + ; CHECK-NEXT: [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_U32_e64 [[GLOBAL_LOAD_DWORD]], [[V_ADD_U32_e64_]], implicit $exec + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_LT_U32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.Flow6: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %63:vgpr_32, %bb.0, %13, %bb.3 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %63:vgpr_32, %bb.0, %12, %bb.3 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %63:vgpr_32, %bb.0, %14, %bb.3 + ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY15]], %subreg.sub0, [[COPY14]], %subreg.sub1 + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_3:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_2]], 16, [[V_LSHL_OR_B32_e64_1]], implicit $exec + ; CHECK-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE [[SI_IF]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.bb1: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE4]], 0, 0, implicit $exec :: (load (s128) from %ir.p3, align 4, addrspace 1) + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub2 + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub0 + ; CHECK-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub1 + ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub0, [[GLOBAL_LOAD_DWORDX4_]].sub1, 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD_U32_e64_1]], [[GLOBAL_LOAD_DWORDX4_]].sub2, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE4]], [[V_MUL_LO_U32_e64_]], 0, 0, implicit $exec :: (store (s32) into %ir.p3, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[V_ADD_U32_e64_1]], 0, 0, implicit $exec :: (store (s32) into %ir.p8, addrspace 1) + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.bb2: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY16]], 0, 0, implicit $exec :: (load (s8) from %ir.p2, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY16]], 1, 0, implicit $exec :: (load (s8) from %ir.p2 + 1, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY16]], 2, 0, implicit $exec :: (load (s8) from %ir.p2 + 2, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_3:%[0-9]+]]:vgpr_32 = 
DS_READ_U8_gfx9 [[COPY16]], 3, 0, implicit $exec :: (load (s8) from %ir.p2 + 3, addrspace 3) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_4:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U8_gfx9_1]], 8, [[DS_READ_U8_gfx9_]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_5:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U8_gfx9_3]], 8, [[DS_READ_U8_gfx9_2]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_6:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_5]], 16, [[V_LSHL_OR_B32_e64_4]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE4:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 4, 0, implicit $exec :: (load (s8) from %ir.p8 + 4, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE5:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 5, 0, implicit $exec :: (load (s8) from %ir.p8 + 5, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_7:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE5]], 8, [[GLOBAL_LOAD_UBYTE4]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE6:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 6, 0, implicit $exec :: (load (s8) from %ir.p8 + 6, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE7:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 7, 0, implicit $exec :: (load (s8) from %ir.p8 + 7, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_8:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE7]], 8, [[GLOBAL_LOAD_UBYTE6]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_9:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_8]], 16, [[V_LSHL_OR_B32_e64_7]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE8:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 8, 0, implicit $exec :: (load (s8) from %ir.p8 + 8, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE9:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 9, 0, implicit $exec :: (load (s8) from %ir.p8 + 9, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_10:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE9]], 8, [[GLOBAL_LOAD_UBYTE8]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE10:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 10, 0, implicit $exec :: (load (s8) from %ir.p8 + 10, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE11:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 11, 0, implicit $exec :: (load (s8) from %ir.p8 + 11, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_11:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE11]], 8, [[GLOBAL_LOAD_UBYTE10]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_12:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_11]], 16, [[V_LSHL_OR_B32_e64_10]], implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHL_OR_B32_e64_12]], %subreg.sub0, undef %448:vgpr_32, %subreg.sub1 + ; CHECK-NEXT: %118:vreg_64, $sgpr_null = V_MAD_U64_U32_e64 [[V_LSHL_OR_B32_e64_6]], [[V_LSHL_OR_B32_e64_9]], [[REG_SEQUENCE5]], 0, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 %118.sub0, [[GLOBAL_LOAD_DWORD1]], 0, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.bb3: + ; CHECK-NEXT: successors: %bb.7(0x40000000), %bb.5(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[PHI2]], %bb.1, [[COPY20]], %bb.2 + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[V_ADD_U32_e64_]], %bb.1, [[COPY20]], %bb.2 + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, [[COPY21]], %bb.2 + ; CHECK-NEXT: [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, 
[[COPY19]], %bb.2 + ; CHECK-NEXT: SI_END_CF [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHL_OR_B32_e64_]], %subreg.sub0, undef %446:vgpr_32, %subreg.sub1 + ; CHECK-NEXT: %127:vreg_64, $sgpr_null = V_MAD_U64_U32_e64 [[PHI3]], [[GLOBAL_LOAD_DWORD1]], [[REG_SEQUENCE6]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[PHI4]], %subreg.sub0, undef %444:vgpr_32, %subreg.sub1 + ; CHECK-NEXT: %133:vreg_64, $sgpr_null = V_MAD_U64_U32_e64 %127.sub0, [[V_ADD_U32_e64_]], [[REG_SEQUENCE7]], 0, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_1:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 %133.sub0, [[PHI5]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 %133.sub0, [[PHI6]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[PHI3]], [[PHI5]], implicit $exec + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array2, target-flags(amdgpu-gotprel32-hi) @array2, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY22:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY22]], 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array2, i64 20)`) + ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[FLAT_LOAD_DWORD]], [[PHI6]], 0, implicit $exec + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET1:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array4, target-flags(amdgpu-gotprel32-hi) @array4, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET1]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY23:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM1]] + ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY23]], [[V_ADD_U32_e64_2]], 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr getelementptr inbounds nuw (i8, ptr @array4, i64 4)`) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE12:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 20, 0, implicit $exec :: (load (s8) from %ir.p3 + 20, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE13:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 21, 0, implicit $exec :: (load (s8) from %ir.p3 + 21, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_13:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE13]], 8, [[GLOBAL_LOAD_UBYTE12]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE14:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 22, 0, implicit $exec :: (load (s8) from %ir.p3 + 22, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE15:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 23, 0, implicit $exec :: (load (s8) from %ir.p3 + 23, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_14:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE15]], 8, [[GLOBAL_LOAD_UBYTE14]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_15:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_14]], 16, [[V_LSHL_OR_B32_e64_13]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE16:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 0, 0, implicit 
$exec :: (load (s8) from %ir.p3, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE17:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 1, 0, implicit $exec :: (load (s8) from %ir.p3 + 1, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_16:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE17]], 8, [[GLOBAL_LOAD_UBYTE16]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE18:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 2, 0, implicit $exec :: (load (s8) from %ir.p3 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE19:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 3, 0, implicit $exec :: (load (s8) from %ir.p3 + 3, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_17:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE19]], 8, [[GLOBAL_LOAD_UBYTE18]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_18:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_17]], 16, [[V_LSHL_OR_B32_e64_16]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE20:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 28, 0, implicit $exec :: (load (s8) from %ir.p3 + 28, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE21:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 29, 0, implicit $exec :: (load (s8) from %ir.p3 + 29, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_19:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE21]], 8, [[GLOBAL_LOAD_UBYTE20]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE22:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 30, 0, implicit $exec :: (load (s8) from %ir.p3 + 30, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE23:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 31, 0, implicit $exec :: (load (s8) from %ir.p3 + 31, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_20:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE23]], 8, [[GLOBAL_LOAD_UBYTE22]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_21:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_20]], 16, [[V_LSHL_OR_B32_e64_19]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE24:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 24, 0, implicit $exec :: (load (s8) from %ir.p3 + 24, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE25:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 25, 0, implicit $exec :: (load (s8) from %ir.p3 + 25, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_22:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE25]], 8, [[GLOBAL_LOAD_UBYTE24]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE26:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 26, 0, implicit $exec :: (load (s8) from %ir.p3 + 26, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE27:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 27, 0, implicit $exec :: (load (s8) from %ir.p3 + 27, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_23:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE27]], 8, [[GLOBAL_LOAD_UBYTE26]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_24:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_23]], 16, [[V_LSHL_OR_B32_e64_22]], implicit $exec + ; CHECK-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY22]], 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array2, i64 28)`) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[FLAT_LOAD_DWORD1]], [[V_MUL_LO_U32_e64_2]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 
[[V_ADD_U32_e64_2]], [[PHI5]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_4:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[PHI6]], [[FLAT_LOAD_DWORD]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_2:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_MUL_LO_U32_e64_4]], [[V_MUL_LO_U32_e64_3]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_5:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD_U32_e64_3]], [[V_SUB_U32_e64_2]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MUL_LO_U32_e64_5]], [[V_LSHL_OR_B32_e64_24]], 0, implicit $exec + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET2:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array3, target-flags(amdgpu-gotprel32-hi) @array3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET2]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM2]] + ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY24]], [[V_ADD_U32_e64_4]], 68, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr getelementptr inbounds nuw (i8, ptr @array3, i64 68)`) + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET3:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array5, target-flags(amdgpu-gotprel32-hi) @array5, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM3:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET3]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY25:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM3]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD2:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY25]], 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array5, i64 20)`) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_6:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[FLAT_LOAD_DWORD2]], [[V_LSHL_OR_B32_e64_21]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MUL_LO_U32_e64_6]], [[PHI4]], 0, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_3:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[FLAT_LOAD_DWORD]], [[V_LSHL_OR_B32_e64_21]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_7:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[PHI6]], [[PHI5]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_5]], [[V_ADD_U32_e64_3]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MUL_LO_U32_e64_6]], [[V_MUL_LO_U32_e64_5]], 0, implicit $exec + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[REG_SEQUENCE1]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: [[V_SUB_U32_e64_4:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_LSHL_OR_B32_e64_21]], [[PHI6]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_SUB_U32_e64_3]], [[FLAT_LOAD_DWORD1]], 0, implicit $exec + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET4:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array6, target-flags(amdgpu-gotprel32-hi) @array6, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM4:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET4]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY2]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY 
[[S_LOAD_DWORDX2_IMM4]] + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY3]], %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD3:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY26]], 44, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array6, i64 44)`) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY]], %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_8:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[FLAT_LOAD_DWORD3]], [[V_SUB_U32_e64_3]], implicit $exec + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET5:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array7, target-flags(amdgpu-gotprel32-hi) @array7, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM5:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET5]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY1]], %stack.4, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) + ; CHECK-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM5]] + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY8]], %stack.5, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD4:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY27]], 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array7, i64 20)`) + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET6:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array8, target-flags(amdgpu-gotprel32-hi) @array8, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM6:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET6]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY9]], %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY7]], %stack.7, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5) + ; CHECK-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM6]] + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY6]], %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD5:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY28]], 44, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array8, i64 44)`, align 8) + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET7:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array9, target-flags(amdgpu-gotprel32-hi) @array9, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM7:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET7]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[GLOBAL_LOAD_DWORD1]], %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[REG_SEQUENCE2]], %stack.10, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.10, align 4, addrspace 5) + ; CHECK-NEXT: [[COPY29:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM7]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD6:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY29]], 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array9, i64 24)`) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_9:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 
[[FLAT_LOAD_DWORD6]], [[V_ADD_U32_e64_4]], implicit $exec + ; CHECK-NEXT: [[FLAT_LOAD_DWORD7:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY22]], 84, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array2, i64 84)`) + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[REG_SEQUENCE]], %stack.11, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.11, align 4, addrspace 5) + ; CHECK-NEXT: [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[FLAT_LOAD_DWORD7]], [[V_MUL_LO_U32_e64_5]], 0, implicit $exec + ; CHECK-NEXT: [[FLAT_LOAD_DWORD8:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY24]], 80, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array3, i64 80)`) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD9:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY23]], 80, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array4, i64 80)`, align 8) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD10:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY25]], 88, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array5, i64 88)`) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_LSHL_OR_B32_e64_15]], %stack.12, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_10:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[FLAT_LOAD_DWORD10]], [[V_ADD_U32_e64_3]], implicit $exec + ; CHECK-NEXT: [[FLAT_LOAD_DWORD11:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY28]], 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array8, i64 20)`) + ; CHECK-NEXT: [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[FLAT_LOAD_DWORD11]], [[V_MUL_LO_U32_e64_3]], 0, implicit $exec + ; CHECK-NEXT: [[FLAT_LOAD_DWORD12:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY23]], 8, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array4, i64 8)`) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD13:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY22]], 12, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array2, i64 12)`) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD14:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY24]], 4, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array3, i64 4)`) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD15:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY25]], 4, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array5, i64 4)`) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_11:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[FLAT_LOAD_DWORD15]], [[V_MUL_LO_U32_e64_6]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE28:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 16, 0, implicit $exec :: (load (s8) from %ir.p4 + 16, addrspace 1) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[FLAT_LOAD_DWORD15]], %stack.13, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE29:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 17, 0, implicit $exec :: (load (s8) from %ir.p4 + 17, addrspace 1) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_LSHL_OR_B32_e64_18]], %stack.14, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_25:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE29]], 8, [[GLOBAL_LOAD_UBYTE28]], implicit $exec + 
; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE30:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 18, 0, implicit $exec :: (load (s8) from %ir.p4 + 18, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE31:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 19, 0, implicit $exec :: (load (s8) from %ir.p4 + 19, addrspace 1) + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[COPY26]], %stack.15, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.15, align 4, addrspace 5) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_26:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE31]], 8, [[GLOBAL_LOAD_UBYTE30]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_27:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_26]], 16, [[V_LSHL_OR_B32_e64_25]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE32:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 12, 0, implicit $exec :: (load (s8) from %ir.p4 + 12, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE33:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 13, 0, implicit $exec :: (load (s8) from %ir.p4 + 13, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_28:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE33]], 8, [[GLOBAL_LOAD_UBYTE32]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE34:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 14, 0, implicit $exec :: (load (s8) from %ir.p4 + 14, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE35:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 15, 0, implicit $exec :: (load (s8) from %ir.p4 + 15, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_29:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE35]], 8, [[GLOBAL_LOAD_UBYTE34]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_30:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_29]], 16, [[V_LSHL_OR_B32_e64_28]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_12:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_LSHL_OR_B32_e64_30]], [[V_SUB_U32_e64_3]], implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[V_LSHL_OR_B32_e64_27]], [[V_ADD_U32_e64_5]], [[V_MUL_LO_U32_e64_12]], implicit $exec + ; CHECK-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[V_MUL_LO_U32_e64_11]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_RCP_IFLAG_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_IFLAG_F32_e64 0, [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_MUL_F32_e64 0, 1333788670, 0, [[V_RCP_IFLAG_F32_e64_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_CVT_U32_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_5:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 0, [[V_MUL_LO_U32_e64_11]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_13:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_SUB_U32_e64_5]], [[V_CVT_U32_F32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_]], [[V_MUL_LO_U32_e64_13]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_CVT_U32_F32_e64_]], [[V_MUL_HI_U32_e64_]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_ADD3_U32_e64_]], [[V_ADD_U32_e64_11]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_14:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_MUL_HI_U32_e64_1]], [[V_MUL_LO_U32_e64_11]], implicit 
$exec + ; CHECK-NEXT: [[V_SUB_U32_e64_6:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_ADD3_U32_e64_]], [[V_MUL_LO_U32_e64_14]], 0, implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_GE_U32_e64 [[V_SUB_U32_e64_6]], [[V_MUL_LO_U32_e64_11]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[V_MUL_HI_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MUL_HI_U32_e64_1]], 0, [[V_ADD_U32_e64_12]], [[V_CMP_GE_U32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_7:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_SUB_U32_e64_6]], [[V_MUL_LO_U32_e64_11]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_SUB_U32_e64_6]], 0, [[V_SUB_U32_e64_7]], [[V_CMP_GE_U32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_GE_U32_e64 [[V_CNDMASK_B32_e64_1]], [[V_MUL_LO_U32_e64_11]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[V_CNDMASK_B32_e64_]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_]], 0, [[V_ADD_U32_e64_13]], [[V_CMP_GE_U32_e64_1]], implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[FLAT_LOAD_DWORD14]], [[V_ADD_U32_e64_2]], [[V_CNDMASK_B32_e64_2]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_15:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD3_U32_e64_1]], [[FLAT_LOAD_DWORD10]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_8:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_MUL_LO_U32_e64_15]], [[FLAT_LOAD_DWORD6]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[FLAT_LOAD_DWORD13]], [[V_ADD_U32_e64_4]], [[V_SUB_U32_e64_8]], implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE8:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_SUB_U32_e64_1]], %subreg.sub0, undef %442:vgpr_32, %subreg.sub1 + ; CHECK-NEXT: %283:vreg_64, $sgpr_null = V_MAD_U64_U32_e64 [[V_ADD3_U32_e64_2]], [[V_MUL_LO_U32_e64_1]], [[REG_SEQUENCE8]], 0, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_9:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 %283.sub0, [[V_LSHL_OR_B32_e64_24]], 0, implicit $exec + ; CHECK-NEXT: [[V_CVT_F32_U32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[V_LSHL_OR_B32_e64_21]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_RCP_IFLAG_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_IFLAG_F32_e64 0, [[V_CVT_F32_U32_e64_1]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_MUL_F32_e64 0, 1333788670, 0, [[V_RCP_IFLAG_F32_e64_1]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_CVT_U32_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, [[V_MUL_F32_e64_1]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_10:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 0, [[V_LSHL_OR_B32_e64_21]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_16:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_SUB_U32_e64_10]], [[V_CVT_U32_F32_e64_1]], implicit $exec + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_1]], [[V_MUL_LO_U32_e64_16]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_CVT_U32_F32_e64_1]], [[V_MUL_HI_U32_e64_2]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_3:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_SUB_U32_e64_9]], [[V_ADD_U32_e64_14]], implicit 
$exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_17:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_MUL_HI_U32_e64_3]], [[V_LSHL_OR_B32_e64_21]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_11:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_SUB_U32_e64_9]], [[V_MUL_LO_U32_e64_17]], 0, implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_2:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_GE_U32_e64 [[V_SUB_U32_e64_11]], [[V_LSHL_OR_B32_e64_21]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[V_MUL_HI_U32_e64_3]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MUL_HI_U32_e64_3]], 0, [[V_ADD_U32_e64_15]], [[V_CMP_GE_U32_e64_2]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_12:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_SUB_U32_e64_11]], [[V_LSHL_OR_B32_e64_21]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_4:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_SUB_U32_e64_11]], 0, [[V_SUB_U32_e64_12]], [[V_CMP_GE_U32_e64_2]], implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_GE_U32_e64 [[V_CNDMASK_B32_e64_4]], [[V_LSHL_OR_B32_e64_21]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[V_CNDMASK_B32_e64_3]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_5:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_3]], 0, [[V_ADD_U32_e64_16]], [[V_CMP_GE_U32_e64_3]], implicit $exec + ; CHECK-NEXT: DS_WRITE_B8_D16_HI [[COPY16]], [[V_CNDMASK_B32_e64_5]], 2, 0, implicit $exec :: (store (s8) into %ir.p2 + 2, addrspace 3) + ; CHECK-NEXT: DS_WRITE_B8_gfx9 [[COPY16]], [[V_CNDMASK_B32_e64_5]], 0, 0, implicit $exec :: (store (s8) into %ir.p2, addrspace 3) + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 24, [[V_CNDMASK_B32_e64_5]], implicit $exec + ; CHECK-NEXT: DS_WRITE_B8_gfx9 [[COPY16]], [[V_LSHRREV_B32_e64_]], 3, 0, implicit $exec :: (store (s8) into %ir.p2 + 3, addrspace 3) + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 8, [[V_CNDMASK_B32_e64_5]], implicit $exec + ; CHECK-NEXT: DS_WRITE_B8_gfx9 [[COPY16]], [[V_LSHRREV_B32_e64_1]], 1, 0, implicit $exec :: (store (s8) into %ir.p2 + 1, addrspace 3) + ; CHECK-NEXT: [[V_SUB_U32_e64_13:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_CNDMASK_B32_e64_5]], [[FLAT_LOAD_DWORD11]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE9:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[FLAT_LOAD_DWORD5]], %subreg.sub0, undef %440:vgpr_32, %subreg.sub1 + ; CHECK-NEXT: %313:vreg_64, $sgpr_null = V_MAD_U64_U32_e64 [[V_SUB_U32_e64_13]], [[V_LSHL_OR_B32_e64_27]], [[REG_SEQUENCE9]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE10:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[FLAT_LOAD_DWORD]], %subreg.sub0, undef %438:vgpr_32, %subreg.sub1 + ; CHECK-NEXT: %319:vreg_64, $sgpr_null = V_MAD_U64_U32_e64 %313.sub0, [[FLAT_LOAD_DWORD1]], [[REG_SEQUENCE10]], 0, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_14:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 %319.sub0, [[V_MUL_LO_U32_e64_2]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE11:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_SUB_U32_e64_14]], %subreg.sub0, undef %436:vgpr_32, %subreg.sub1 + ; CHECK-NEXT: %325:vreg_64, $sgpr_null = V_MAD_U64_U32_e64 [[FLAT_LOAD_DWORD12]], [[V_ADD_U32_e64_2]], [[REG_SEQUENCE11]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_18:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 %325.sub0, [[V_ADD_U32_e64_10]], implicit $exec + ; CHECK-NEXT: [[V_CVT_F32_U32_e64_2:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[V_MUL_LO_U32_e64_10]], 0, 0, implicit $mode, implicit 
$exec + ; CHECK-NEXT: [[V_RCP_IFLAG_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_IFLAG_F32_e64 0, [[V_CVT_F32_U32_e64_2]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_F32_e64_2:%[0-9]+]]:vgpr_32 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_MUL_F32_e64 0, 1333788670, 0, [[V_RCP_IFLAG_F32_e64_2]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_CVT_U32_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, [[V_MUL_F32_e64_2]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_15:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 0, [[V_MUL_LO_U32_e64_10]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_19:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_SUB_U32_e64_15]], [[V_CVT_U32_F32_e64_2]], implicit $exec + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_4:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_2]], [[V_MUL_LO_U32_e64_19]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_CVT_U32_F32_e64_2]], [[V_MUL_HI_U32_e64_4]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_5:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_MUL_LO_U32_e64_18]], [[V_ADD_U32_e64_17]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_20:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_MUL_HI_U32_e64_5]], [[V_MUL_LO_U32_e64_10]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_16:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_MUL_LO_U32_e64_18]], [[V_MUL_LO_U32_e64_20]], 0, implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_4:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_GE_U32_e64 [[V_SUB_U32_e64_16]], [[V_MUL_LO_U32_e64_10]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[V_MUL_HI_U32_e64_5]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_6:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MUL_HI_U32_e64_5]], 0, [[V_ADD_U32_e64_18]], [[V_CMP_GE_U32_e64_4]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_17:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_SUB_U32_e64_16]], [[V_MUL_LO_U32_e64_10]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_7:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_SUB_U32_e64_16]], 0, [[V_SUB_U32_e64_17]], [[V_CMP_GE_U32_e64_4]], implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_5:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_GE_U32_e64 [[V_CNDMASK_B32_e64_7]], [[V_MUL_LO_U32_e64_10]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[V_CNDMASK_B32_e64_6]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_8:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_6]], 0, [[V_ADD_U32_e64_19]], [[V_CMP_GE_U32_e64_5]], implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[FLAT_LOAD_DWORD9]], [[V_MUL_LO_U32_e64_4]], [[V_CNDMASK_B32_e64_8]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_18:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_SUB_U32_e64_2]], [[FLAT_LOAD_DWORD8]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[V_ADD3_U32_e64_3]], [[V_SUB_U32_e64_18]], [[V_ADD_U32_e64_9]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_21:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD3_U32_e64_4]], [[V_MUL_LO_U32_e64_9]], implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[FLAT_LOAD_DWORD5]], [[V_MUL_LO_U32_e64_6]], [[V_MUL_LO_U32_e64_21]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_19:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_ADD_U32_e64_5]], [[FLAT_LOAD_DWORD4]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 
[[V_ADD3_U32_e64_5]], [[V_SUB_U32_e64_19]], [[V_MUL_LO_U32_e64_8]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_22:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD3_U32_e64_6]], [[V_ADD_U32_e64_2]], implicit $exec + ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY27]], [[V_MUL_LO_U32_e64_22]], 68, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr getelementptr inbounds nuw (i8, ptr @array7, i64 68)`) + ; CHECK-NEXT: [[V_ADD3_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[FLAT_LOAD_DWORD]], [[V_ADD_U32_e64_4]], [[V_MUL_LO_U32_e64_22]], implicit $exec + ; CHECK-NEXT: [[V_CVT_F32_U32_e64_3:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[V_ADD_U32_e64_8]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_RCP_IFLAG_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_IFLAG_F32_e64 0, [[V_CVT_F32_U32_e64_3]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_F32_e64_3:%[0-9]+]]:vgpr_32 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_MUL_F32_e64 0, 1333788670, 0, [[V_RCP_IFLAG_F32_e64_3]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_CVT_U32_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, [[V_MUL_F32_e64_3]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_20:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 0, [[V_ADD_U32_e64_8]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_23:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_SUB_U32_e64_20]], [[V_CVT_U32_F32_e64_3]], implicit $exec + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_6:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_3]], [[V_MUL_LO_U32_e64_23]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_CVT_U32_F32_e64_3]], [[V_MUL_HI_U32_e64_6]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_7:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_ADD3_U32_e64_7]], [[V_ADD_U32_e64_20]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_24:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_MUL_HI_U32_e64_7]], [[V_ADD_U32_e64_8]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_21:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_ADD3_U32_e64_7]], [[V_MUL_LO_U32_e64_24]], 0, implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_6:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_GE_U32_e64 [[V_SUB_U32_e64_21]], [[V_ADD_U32_e64_8]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[V_MUL_HI_U32_e64_7]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_9:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MUL_HI_U32_e64_7]], 0, [[V_ADD_U32_e64_21]], [[V_CMP_GE_U32_e64_6]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_22:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_SUB_U32_e64_21]], [[V_ADD_U32_e64_8]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_10:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_SUB_U32_e64_21]], 0, [[V_SUB_U32_e64_22]], [[V_CMP_GE_U32_e64_6]], implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_7:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_GE_U32_e64 [[V_CNDMASK_B32_e64_10]], [[V_ADD_U32_e64_8]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[V_CNDMASK_B32_e64_9]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_11:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_9]], 0, [[V_ADD_U32_e64_22]], [[V_CMP_GE_U32_e64_7]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_25:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_CNDMASK_B32_e64_11]], [[V_SUB_U32_e64_4]], implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[V_MUL_LO_U32_e64_25]], [[V_ADD_U32_e64_7]], [[V_ADD_U32_e64_6]], 
implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_26:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD3_U32_e64_8]], [[V_MUL_LO_U32_e64_7]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_27:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_MUL_LO_U32_e64_26]], [[V_LSHL_OR_B32_e64_3]], implicit $exec + ; CHECK-NEXT: [[FLAT_LOAD_DWORD16:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY24]], 84, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array3, i64 84)`) + ; CHECK-NEXT: [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[FLAT_LOAD_DWORD16]], [[V_LSHL_OR_B32_e64_27]], 0, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_23:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_ADD_U32_e64_23]], [[V_MUL_LO_U32_e64_2]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE12:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MUL_LO_U32_e64_27]], %subreg.sub0, [[V_SUB_U32_e64_23]], %subreg.sub1 + ; CHECK-NEXT: FLAT_STORE_DWORDX2 [[COPY23]], [[REG_SEQUENCE12]], 76, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `ptr getelementptr inbounds nuw (i8, ptr @array4, i64 76)`, align 4) + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.15, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.15, align 4, addrspace 5) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD17:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[SI_SPILL_V64_RESTORE]], 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array6, i64 28)`) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.14, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5) + ; CHECK-NEXT: [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[FLAT_LOAD_DWORD17]], [[SI_SPILL_V32_RESTORE]], 0, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.12, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5) + ; CHECK-NEXT: [[V_SUB_U32_e64_24:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[SI_SPILL_V32_RESTORE1]], [[V_ADD_U32_e64_2]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32 = V_ADD_CO_U32_e64 [[V_ADD_U32_e64_24]], [[V_SUB_U32_e64_24]], 0, implicit $exec + ; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_ADD_CO_U32_e64_1]], -1, implicit-def dead $scc + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_28:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD_CO_U32_e64_]], [[V_MUL_LO_U32_e64_7]], implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE1:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.11, align 4, addrspace 5) + ; CHECK-NEXT: GLOBAL_STORE_SHORT_D16_HI [[SI_SPILL_V64_RESTORE1]], [[V_MUL_LO_U32_e64_28]], 2, 0, implicit $exec :: (store (s16) into %ir.p8 + 2, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_SHORT [[SI_SPILL_V64_RESTORE1]], [[V_MUL_LO_U32_e64_28]], 0, 0, implicit $exec :: (store (s16) into %ir.p8, addrspace 1) + ; CHECK-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[S_XOR_B32_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5.Flow5: + ; CHECK-NEXT: successors: %bb.6(0x40000000), %bb.8(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI7:%[0-9]+]]:vreg_64 = PHI [[REG_SEQUENCE3]], %bb.4, undef %464:vreg_64, %bb.7 + ; CHECK-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[V_SUB_U32_e64_24]], %bb.4, undef %466:vgpr_32, %bb.7 + ; CHECK-NEXT: [[SI_ELSE1:%[0-9]+]]:sreg_32 = 
SI_ELSE [[SI_IF1]], %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6.bb4: + ; CHECK-NEXT: successors: %bb.8(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[PHI7]], [[PHI8]], 0, 0, implicit $exec :: (store (s32) into %ir.p4, addrspace 1) + ; CHECK-NEXT: S_BRANCH %bb.8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7.bb5: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE2:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.10, align 4, addrspace 5) + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[SI_SPILL_V64_RESTORE2]], [[V_ADD_CO_U32_e64_]], 0, 0, implicit $exec :: (store (s32) into %ir.p5, addrspace 1) + ; CHECK-NEXT: S_BRANCH %bb.5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8.bb6: + ; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.9(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SI_END_CF [[SI_ELSE1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE2:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) + ; CHECK-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_U32_e64 [[SI_SPILL_V32_RESTORE2]], [[FLAT_LOAD_DWORD16]], implicit $exec + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE3:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE4:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5) + ; CHECK-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_GT_U32_e64_]], %bb.9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.9.Flow: + ; CHECK-NEXT: successors: %bb.10(0x40000000), %bb.12(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI9:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.8, %458, %bb.11 + ; CHECK-NEXT: [[PHI10:%[0-9]+]]:vgpr_32 = PHI [[SI_SPILL_V32_RESTORE4]], %bb.8, undef %468:vgpr_32, %bb.11 + ; CHECK-NEXT: [[PHI11:%[0-9]+]]:vgpr_32 = PHI [[SI_SPILL_V32_RESTORE3]], %bb.8, undef %470:vgpr_32, %bb.11 + ; CHECK-NEXT: [[PHI12:%[0-9]+]]:vgpr_32 = PHI [[V_ADD_U32_e64_24]], %bb.8, undef %472:vgpr_32, %bb.11 + ; CHECK-NEXT: [[PHI13:%[0-9]+]]:vreg_64 = PHI %319, %bb.8, undef %474:vreg_64, %bb.11 + ; CHECK-NEXT: [[SI_ELSE2:%[0-9]+]]:sreg_32 = SI_ELSE [[SI_IF2]], %bb.12, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.10.bb7: + ; CHECK-NEXT: successors: %bb.12(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[REG_SEQUENCE13:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[PHI10]], %subreg.sub0, [[PHI11]], %subreg.sub1 + ; CHECK-NEXT: [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI12]], [[PHI13]].sub0, 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE13]], [[V_ADD_U32_e64_25]], 0, 0, implicit $exec :: (store (s32) into %ir.p6, addrspace 1) + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI9]], $exec_lo, implicit-def dead $scc + ; CHECK-NEXT: S_BRANCH %bb.12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.11.bb8: + ; CHECK-NEXT: successors: %bb.9(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: 
[[SI_SPILL_V32_RESTORE5:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE6:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5) + ; CHECK-NEXT: [[REG_SEQUENCE14:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[SI_SPILL_V32_RESTORE5]], %subreg.sub0, [[SI_SPILL_V32_RESTORE6]], %subreg.sub1 + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE7:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.13, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5) + ; CHECK-NEXT: [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MUL_LO_U32_e64_28]], [[SI_SPILL_V32_RESTORE7]], 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE14]], [[V_ADD_U32_e64_26]], 0, 0, implicit $exec :: (store (s32) into %ir.p7, addrspace 1) + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_ADD_CO_U32_e64_1]], $exec_lo, implicit-def dead $scc + ; CHECK-NEXT: [[COPY30:%[0-9]+]]:sreg_32 = COPY [[S_AND_B32_]] + ; CHECK-NEXT: S_BRANCH %bb.9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.12.Flow4: + ; CHECK-NEXT: successors: %bb.13(0x40000000), %bb.14(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI14:%[0-9]+]]:sreg_32 = PHI [[PHI9]], %bb.9, [[S_OR_B32_]], %bb.10 + ; CHECK-NEXT: SI_END_CF [[SI_ELSE2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE8:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE9:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) + ; CHECK-NEXT: [[REG_SEQUENCE15:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[SI_SPILL_V32_RESTORE9]], %subreg.sub0, [[SI_SPILL_V32_RESTORE8]], %subreg.sub1 + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE10:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE11:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: [[REG_SEQUENCE16:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[SI_SPILL_V32_RESTORE11]], %subreg.sub0, [[SI_SPILL_V32_RESTORE10]], %subreg.sub1 + ; CHECK-NEXT: [[SI_IF3:%[0-9]+]]:sreg_32 = SI_IF [[PHI14]], %bb.14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.13.bb9: + ; CHECK-NEXT: successors: %bb.14(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[SI_SPILL_V32_RESTORE]], [[V_LSHL_OR_B32_e64_3]], 0, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE3:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[SI_SPILL_V64_RESTORE3]], [[V_ADD_U32_e64_27]], 0, 0, implicit $exec :: (store (s32) into %ir.p1, addrspace 1) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.14.bb10: + ; CHECK-NEXT: SI_END_CF [[SI_IF3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE36:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE15]], 0, 0, implicit $exec :: (load (s8) from %ir.p10, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE37:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE15]], 1, 0, implicit $exec :: 
(load (s8) from %ir.p10 + 1, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_31:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE37]], 8, [[GLOBAL_LOAD_UBYTE36]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE38:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE15]], 2, 0, implicit $exec :: (load (s8) from %ir.p10 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE39:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE15]], 3, 0, implicit $exec :: (load (s8) from %ir.p10 + 3, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_32:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE39]], 8, [[GLOBAL_LOAD_UBYTE38]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_33:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_32]], 16, [[V_LSHL_OR_B32_e64_31]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHL_OR_B32_e64_33]], [[V_LSHL_OR_B32_e64_3]], 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE16]], [[V_ADD_U32_e64_28]], 0, 0, implicit $exec :: (store (s32) into %ir.p9, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 +entry: + %ld = load i32, ptr addrspace(1) %p4, align 2 + %ld0 = load i32, ptr addrspace(1) %p5, align 8 + %ld1 = load i32, ptr addrspace(1) %p1, align 4 + %gep1 = getelementptr inbounds i32, ptr addrspace(1) %p1, i64 3 + %ld2 = load i32, ptr addrspace(1) %gep1, align 1 + %tmp1 = add i32 %ld, %ld0 + store i32 %tmp1, ptr addrspace(1) %p1 + %cond1 = icmp uge i32 %ld0, %tmp1 + br i1 %cond1, label %bb1, label %bb2 + +bb1: + %load1 = load i32, ptr addrspace(1) %p3, align 4 + %load2 = load <8 x i32>, ptr addrspace(1) %p3, align 1 + %extract1 = extractelement < 8 x i32> %load2, i32 1 + %extract2 = extractelement < 8 x i32> %load2, i32 2 + %tmp84 = add i32 %load1, %extract1 + %tmp85 = mul i32 %tmp84, %extract2 + store i32 %tmp85, ptr addrspace(1) %p3 + store i32 %tmp84, ptr addrspace(1) %p8 + br label %bb3 + +bb2: + %ld3 = load i32, ptr addrspace(3) %p2, align 1 + %load4 = load <8 x i32>, ptr addrspace(1) %p8, align 1 + %extract11 = extractelement < 8 x i32> %load4, i32 1 + %extract12 = extractelement < 8 x i32> %load4, i32 2 + %tmp70 = mul i32 %ld3, %extract11 + %tmp71 = add i32 %tmp70, %extract12 + %tmp72 = sub i32 %tmp71, %ld1 + br label %bb3 + +bb3: + %phi1 = phi i32 [ %load1, %bb1 ], [ %tmp72, %bb2 ] + %phi2 = phi i32 [ %load1, %bb1 ], [ %tmp1, %bb2 ] + %phi3 = phi i32 [ %extract1, %bb1 ], [ %extract11, %bb2 ] + %phi4 = phi i32 [ %extract2, %bb1 ], [ %extract12, %bb2 ] + %tmp73 = mul i32 %phi1, %ld1 + %tmp74 = add i32 %tmp73, %ld + %tmp75 = mul i32 %tmp74, %tmp1 + %tmp76 = add i32 %tmp75, %phi2 + %tmp77 = sub i32 %tmp76, %phi3 + %tmp78 = mul i32 %tmp76, %phi4 + %tmp2 = mul i32 %phi1, %phi3 + %idx10 = getelementptr inbounds [5 x i32], [5 x i32]* @array2, i64 1, i64 0 + %val0 = load i32, i32* %idx10, align 4 + %tmp3 = add i32 %val0, %phi4 + %idx20 = getelementptr inbounds [5 x i32], [5 x i32]* @array4, i64 0, i64 1 + store i32 %tmp3, i32 *%idx20 + %load22 = load <8 x i32>, ptr addrspace(1) %p3, align 1 + %extract3 = extractelement < 8 x i32> %load22, i32 6 + %idx12 = getelementptr inbounds [5 x i32], [5 x i32]* @array2, i64 1, i64 2 + %val2 = load i32, i32* %idx12, align 4 + %tmp4 = mul i32 %val2, %tmp2 + %tmp5= add i32 %tmp3, %phi3 + %tmp6 = mul i32 %phi4, %val0 + %tmp7 = sub i32 %tmp6, %tmp4 + %tmp8 = mul i32 %tmp5, %tmp7 + %tmp9 = add i32 %tmp8, %extract3 + %idx22 = getelementptr inbounds [5 x i32], [5 x i32]* @array3, i64 3, i64 2 + store i32 %tmp9, i32 *%idx22 + %extract4 = extractelement < 8 x i32> 
%load22, i32 7 + %idx13 = getelementptr inbounds [5 x i32], [5 x i32]* @array5, i64 1, i64 0 + %val3 = load i32, i32* %idx13, align 4 + %tmp10 = mul i32 %val3, %extract4 + %tmp11 = add i32 %tmp10, %phi2 + %tmp12 = sub i32 %val0, %extract4 + %tmp13 = mul i32 %phi4, %phi3 + %tmp14 = add i32 %tmp11, %tmp5 + %tmp15 = add i32 %tmp10, %tmp8 + %tmp16 = sub i32 %extract4, %phi4 + %tmp17 = add i32 %tmp12, %val2 + %tmp18 = add i32 %val0, %tmp9 + %idx601 = getelementptr inbounds [5 x i32], [5 x i32]* @array6, i64 2, i64 1 + %val601 = load i32, i32* %idx601, align 1 + %tmp19 = mul i32 %val601, %tmp12 + %idx701 = getelementptr inbounds [5 x i32], [5 x i32]* @array7, i64 1, i64 0 + %val701 = load i32, i32* %idx701, align 2 + %tmp20 = sub i32 %val701, %tmp11 + %idx801 = getelementptr inbounds [5 x i32], [5 x i32]* @array8, i64 2, i64 1 + %val801 = load i32, i32* %idx801, align 8 + %tmp21 = add i32 %val801, %tmp10 + %idx901 = getelementptr inbounds [5 x i32], [5 x i32]* @array9, i64 1, i64 1 + %val901 = load i32, i32* %idx901, align 1 + %tmp22 = mul i32 %val901, %tmp9 + %idx602 = getelementptr inbounds [5 x i32], [5 x i32]* @array2, i64 4, i64 1 + %val602 = load i32, i32* %idx602, align 1 + %tmp23 = add i32 %val602, %tmp8 + %idx702 = getelementptr inbounds [5 x i32], [5 x i32]* @array3, i64 4, i64 0 + %val702 = load i32, i32* %idx702, align 2 + %tmp24 = sub i32 %val702, %tmp7 + %idx802 = getelementptr inbounds [5 x i32], [5 x i32]* @array4, i64 4, i64 0 + %val802 = load i32, i32* %idx802, align 8 + %tmp25 = add i32 %val802, %tmp6 + %idx902 = getelementptr inbounds [5 x i32], [5 x i32]* @array5, i64 4, i64 2 + %val902 = load i32, i32* %idx902, align 1 + %tmp26 = mul i32 %val902, %tmp5 + %idx800 = getelementptr inbounds [5 x i32], [5 x i32]* @array8, i64 1, i64 0 + %val800 = load i32, i32* %idx800, align 4 + %tmp27 = add i32 %val800, %tmp4 + %idx15 = getelementptr inbounds [5 x i32], [5 x i32]* @array4, i64 0, i64 2 + %val5 = load i32, i32* %idx15, align 4 + %tmp28 = mul i32 %val5, %tmp3 + %idx16 = getelementptr inbounds [5 x i32], [5 x i32]* @array2, i64 0, i64 3 + %val6 = load i32, i32* %idx16, align 4 + %tmp206 = add i32 %val6, %tmp9 + %idx17 = getelementptr inbounds [5 x i32], [5 x i32]* @array3, i64 0, i64 1 + %val7 = load i32, i32* %idx17, align 4 + %tmp207 = add i32 %val7, %tmp3 + %idx18 = getelementptr inbounds [5 x i32], [5 x i32]* @array5, i64 0, i64 1 + %val8 = load i32, i32* %idx18, align 4 + %tmp208 = mul i32 %val8, %tmp10 + %load3 = load <8 x i32>, ptr addrspace(1) %p4, align 1 + %extract7 = extractelement < 8 x i32> %load3, i32 4 + %tmp209 = add i32 %extract7, %tmp11 + %extract8 = extractelement < 8 x i32> %load3, i32 3 + %tmp30 = mul i32 %extract8, %tmp12 + %tmp31 = add i32 %tmp30, %tmp209 + %tmp32 = udiv i32 %tmp31, %tmp208 + %tmp33 = add i32 %tmp32, %tmp207 + %tmp34 = mul i32 %tmp33, %val902 + %tmp35 = sub i32 %tmp34, %val901 + %tmp36 = add i32 %tmp35, %tmp206 + %tmp37 = mul i32 %tmp36, %tmp78 + %tmp38 = add i32 %tmp37, %tmp77 + %tmp39 = sub i32 %tmp38, %extract3 + %tmp40 = udiv i32 %tmp39, %extract4 + store i32 %tmp40, ptr addrspace(3) %p2, align 1 + %tmp41 = sub i32 %tmp40, %val800 + %tmp42 = mul i32 %tmp41, %extract7 + %tmp43 = add i32 %tmp42, %val801 + %tmp44 = mul i32 %tmp43, %val2 + %tmp45 = add i32 %tmp44, %val0 + %tmp46 = sub i32 %tmp45, %tmp2 + %tmp47 = add i32 %tmp46, %tmp28 + %tmp48 = mul i32 %tmp47, %tmp27 + %tmp49 = udiv i32 %tmp48, %tmp26 + %tmp50 = add i32 %tmp49, %tmp25 + %tmp51 = sub i32 %tmp50, %tmp24 + %tmp52 = add i32 %tmp51, %tmp23 + %tmp53 = mul i32 %tmp52, %tmp22 + 
%tmp54 = add i32 %tmp53, %tmp21 + %tmp55 = sub i32 %tmp54, %tmp20 + %tmp56 = add i32 %tmp55, %tmp19 + %tmp57 = mul i32 %tmp56, %tmp3 + %idx700 = getelementptr inbounds [5 x i32], [5 x i32]* @array7, i64 3, i64 2 + store i32 %tmp57, i32 *%idx700 + %tmp58 = add i32 %tmp57, %tmp18 + %tmp59 = udiv i32 %tmp58, %tmp17 + %tmp60 = mul i32 %tmp59, %tmp16 + %tmp61 = add i32 %tmp60, %tmp15 + %tmp62 = add i32 %tmp61, %tmp14 + %tmp63 = mul i32 %tmp62, %tmp13 + %tmp64 = mul i32 %tmp63, %ld2 + %idx23 = getelementptr inbounds [5 x i32], [5 x i32]* @array4, i64 3, i64 4 + store i32 %tmp64, i32 *%idx23 + %extract17 = extractelement < 8 x i32> %load3, i32 4 + %idx14 = getelementptr inbounds [5 x i32], [5 x i32]* @array3, i64 4, i64 1 + %val4 = load i32, i32* %idx14, align 4 + %tmp65 = add i32 %val4, %extract17 + %tmp66 = sub i32 %tmp65, %tmp2 + %idx24 = getelementptr inbounds [5 x i32], [5 x i32]* @array4, i64 3, i64 5 + store i32 %tmp66, i32 *%idx24 + %extract9 = extractelement < 8 x i32> %load22, i32 0 + %idx600 = getelementptr inbounds [5 x i32], [5 x i32]* @array6, i64 1, i64 2 + %val600 = load i32, i32* %idx600, align 4 + %tmp67 = add i32 %val600, %extract9 + %extract10 = extractelement < 8 x i32> %load22, i32 5 + %tmp68 = sub i32 %extract10, %tmp3 + %tmp69 = add i32 %tmp67, %tmp68 + %tmp79 = mul i32 %tmp69, %tmp13 + store i32 %tmp79, ptr addrspace(1) %p8, align 2 + %cond2 = icmp ult i32 %tmp69, %tmp68 + br i1 %cond2, label %bb4, label %bb5 + +bb4: + store i32 %tmp68, ptr addrspace(1) %p4 + br label %bb6 + +bb5: + store i32 %tmp69, ptr addrspace(1) %p5 + br label %bb6 + +bb6: + %tmp80 = mul i32 %tmp66, %ld2 + %cond3 = icmp ule i32 %ld1, %val4 + br i1 %cond3, label %bb7, label %bb8 + +bb7: + %tmp81 = add i32 %tmp67, %tmp45 + store i32 %tmp81, ptr addrspace(1) %p6 + br label %bb9 + +bb8: + %tmp82 = add i32 %tmp79, %val8 + store i32 %tmp82, ptr addrspace(1) %p7 + %xor = xor i1 %cond2, %cond3 + br i1 %xor, label %bb9, label %bb10 + +bb9: + %phi5 = phi i32 [ %tmp81, %bb7], [%tmp82, %bb8] + %tmp83 = add i32 %extract9, %ld2 + store i32 %tmp83, ptr addrspace(1) %p1 + br label %bb10 + +bb10: + %ld10 = load i32, ptr addrspace(1) %p10, align 1 + %tmp90 = add i32 %ld10, %ld2 + store i32 %tmp90, ptr addrspace(1) %p9, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/test_ers_multiple_spills2.ll b/llvm/test/CodeGen/AMDGPU/test_ers_multiple_spills2.ll new file mode 100644 index 0000000000000..c117907eb4d5a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/test_ers_multiple_spills2.ll @@ -0,0 +1,701 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=amdgpu-early-register-spilling -verify-machineinstrs -dump-next-use-distance -print-after=amdgpu-early-register-spilling -max-vgprs=50 < %s 2>&1 | FileCheck %s + +@array2 = global [5 x i32] zeroinitializer, align 4 +@array3 = global [5 x i32] zeroinitializer, align 4 +@array4 = global [5 x i32] zeroinitializer, align 4 +@array5 = global [5 x i32] zeroinitializer, align 4 + +@array6 = global [5 x i32] zeroinitializer, align 4 +@array7 = global [5 x i32] zeroinitializer, align 4 +@array8 = global [5 x i32] zeroinitializer, align 4 +@array9 = global [5 x i32] zeroinitializer, align 4 + +; +; bb.0.entry +; / | +; bb.1.bb1 | +; \ | +; bb.2.bb2 +; / | +; bb.5.bb8 | +; \ | +; bb.3.Flow +; / | +; bb.4.bb7 | +; \ | +; bb.6.Flow1 +; / | +; bb.7.bb9 | +; \ | +; bb.8.bb10 +; +define amdgpu_ps void @test14(ptr addrspace(1) %p1, ptr addrspace(3) %p2, ptr addrspace(1) %p3, ptr 
addrspace(1) %p4, ptr addrspace(1) %p5, ptr addrspace(1) %p6, ptr addrspace(1) %p7, ptr addrspace(1) %p8, ptr addrspace(1) %p9, ptr addrspace(1) %p10, ptr addrspace(1) %p11, i32 %arg1, i32 %arg2) { + ; CHECK-LABEL: name: test14 + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr21 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr20 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr19 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr18 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr17 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr16 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr15 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr14 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr13 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr12 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY17]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY21]], %subreg.sub0, [[COPY20]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE8:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY13]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE9:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY16]], %subreg.sub0, [[COPY15]], %subreg.sub1 + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[REG_SEQUENCE9]], 0, 0, implicit $exec :: (load (s16) from %ir.p4, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[REG_SEQUENCE9]], 2, 0, implicit $exec :: (load (s16) from %ir.p4 + 2, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_USHORT1]], 16, [[GLOBAL_LOAD_USHORT]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE8]], 0, 0, implicit $exec :: (load (s32) from 
%ir.p5, align 8, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE7]], 0, 0, implicit $exec :: (load (s32) from %ir.p1, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE6]], 0, 0, implicit $exec :: (load (s8) from %ir.p9, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE6]], 1, 0, implicit $exec :: (load (s8) from %ir.p9 + 1, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE1]], 8, [[GLOBAL_LOAD_UBYTE]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE2:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE6]], 2, 0, implicit $exec :: (load (s8) from %ir.p9 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE3:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE6]], 3, 0, implicit $exec :: (load (s8) from %ir.p9 + 3, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE3]], 8, [[GLOBAL_LOAD_UBYTE2]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_3:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_2]], 16, [[V_LSHL_OR_B32_e64_1]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT2:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[REG_SEQUENCE5]], 0, 0, implicit $exec :: (load (s16) from %ir.p8, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT3:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[REG_SEQUENCE5]], 2, 0, implicit $exec :: (load (s16) from %ir.p8 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE4:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 0, 0, implicit $exec :: (load (s8) from %ir.p10, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE5:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 1, 0, implicit $exec :: (load (s8) from %ir.p10 + 1, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE6:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 2, 0, implicit $exec :: (load (s8) from %ir.p10 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE7:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 3, 0, implicit $exec :: (load (s8) from %ir.p10 + 3, addrspace 1) + ; CHECK-NEXT: [[V_SUB_U32_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_LSHL_OR_B32_e64_]], [[GLOBAL_LOAD_DWORD]], 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE3]], [[V_SUB_U32_e64_]], 0, 0, implicit $exec :: (store (s32) into %ir.p11, addrspace 1) + ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHL_OR_B32_e64_]], [[GLOBAL_LOAD_DWORD]], 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE7]], [[V_ADD_U32_e64_]], 0, 0, implicit $exec :: (store (s32) into %ir.p1, addrspace 1) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY19]], 0, 0, implicit $exec :: (load (s8) from %ir.p2, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY19]], 1, 0, implicit $exec :: (load (s8) from %ir.p2 + 1, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY19]], 2, 0, implicit $exec :: (load (s8) from %ir.p2 + 2, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY19]], 3, 0, implicit $exec :: (load (s8) from %ir.p2 + 3, addrspace 3) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_4:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U8_gfx9_1]], 8, [[DS_READ_U8_gfx9_]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_5:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 
[[DS_READ_U8_gfx9_3]], 8, [[DS_READ_U8_gfx9_2]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_6:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_5]], 16, [[V_LSHL_OR_B32_e64_4]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE8:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 8, 0, implicit $exec :: (load (s8) from %ir.p3 + 8, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE9:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 9, 0, implicit $exec :: (load (s8) from %ir.p3 + 9, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_7:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE9]], 8, [[GLOBAL_LOAD_UBYTE8]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE10:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 10, 0, implicit $exec :: (load (s8) from %ir.p3 + 10, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE11:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 11, 0, implicit $exec :: (load (s8) from %ir.p3 + 11, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_8:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE11]], 8, [[GLOBAL_LOAD_UBYTE10]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_9:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_8]], 16, [[V_LSHL_OR_B32_e64_7]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE12:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 4, 0, implicit $exec :: (load (s8) from %ir.p3 + 4, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE13:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 5, 0, implicit $exec :: (load (s8) from %ir.p3 + 5, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_10:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE13]], 8, [[GLOBAL_LOAD_UBYTE12]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE14:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 6, 0, implicit $exec :: (load (s8) from %ir.p3 + 6, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE15:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 7, 0, implicit $exec :: (load (s8) from %ir.p3 + 7, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_11:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE15]], 8, [[GLOBAL_LOAD_UBYTE14]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_12:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_11]], 16, [[V_LSHL_OR_B32_e64_10]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE16:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (load (s8) from %ir.p3, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE17:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 1, 0, implicit $exec :: (load (s8) from %ir.p3 + 1, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_13:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE17]], 8, [[GLOBAL_LOAD_UBYTE16]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE18:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 2, 0, implicit $exec :: (load (s8) from %ir.p3 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE19:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 3, 0, implicit $exec :: (load (s8) from %ir.p3 + 3, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_14:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE19]], 8, [[GLOBAL_LOAD_UBYTE18]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_15:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_14]], 16, [[V_LSHL_OR_B32_e64_13]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_LSHL_OR_B32_e64_6]], 
[[V_LSHL_OR_B32_e64_15]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MUL_LO_U32_e64_]], [[V_LSHL_OR_B32_e64_12]], 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[V_ADD_U32_e64_1]], 0, 0, implicit $exec :: (store (s32) into %ir.p3, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD2:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE6]], 0, 0, implicit $exec :: (load (s32) from %ir.p9, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE1]], [[V_LSHL_OR_B32_e64_6]], 0, 0, implicit $exec :: (store (s32) into %ir.p7, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE6]], [[V_MUL_LO_U32_e64_]], 0, 0, implicit $exec :: (store (s32) into %ir.p9, addrspace 1) + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[GLOBAL_LOAD_DWORD]], [[COPY]], implicit $exec + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[REG_SEQUENCE6]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[REG_SEQUENCE7]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[GLOBAL_LOAD_DWORD1]], %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[GLOBAL_LOAD_UBYTE7]], %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[GLOBAL_LOAD_UBYTE6]], %stack.4, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[GLOBAL_LOAD_UBYTE4]], %stack.5, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[GLOBAL_LOAD_UBYTE5]], %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[REG_SEQUENCE]], %stack.7, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.7, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[REG_SEQUENCE1]], %stack.8, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.8, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_LSHL_OR_B32_e64_]], %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_GE_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.bb1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_16:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_USHORT3]], 16, [[GLOBAL_LOAD_USHORT2]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD3:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (load (s32) from %ir.p3, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE20:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 8, 0, implicit $exec :: (load (s8) from %ir.p10 + 8, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE21:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 9, 0, implicit $exec :: (load (s8) from %ir.p10 + 9, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_17:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE21]], 8, [[GLOBAL_LOAD_UBYTE20]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE22:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 10, 0, implicit $exec :: (load (s8) from %ir.p10 + 10, addrspace 1) + ; CHECK-NEXT: 
[[GLOBAL_LOAD_UBYTE23:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 11, 0, implicit $exec :: (load (s8) from %ir.p10 + 11, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_18:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE23]], 8, [[GLOBAL_LOAD_UBYTE22]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_19:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_18]], 16, [[V_LSHL_OR_B32_e64_17]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE24:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 4, 0, implicit $exec :: (load (s8) from %ir.p10 + 4, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE25:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 5, 0, implicit $exec :: (load (s8) from %ir.p10 + 5, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_20:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE25]], 8, [[GLOBAL_LOAD_UBYTE24]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE26:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 6, 0, implicit $exec :: (load (s8) from %ir.p10 + 6, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE27:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 7, 0, implicit $exec :: (load (s8) from %ir.p10 + 7, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_21:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE27]], 8, [[GLOBAL_LOAD_UBYTE26]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_22:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_21]], 16, [[V_LSHL_OR_B32_e64_20]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[GLOBAL_LOAD_DWORD3]], [[V_LSHL_OR_B32_e64_22]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD_U32_e64_2]], [[V_LSHL_OR_B32_e64_19]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[V_MUL_LO_U32_e64_1]], 0, 0, implicit $exec :: (store (s32) into %ir.p6, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE5]], [[V_ADD_U32_e64_2]], 0, 0, implicit $exec :: (store (s32) into %ir.p8, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE6]], [[V_LSHL_OR_B32_e64_22]], 0, 0, implicit $exec :: (store (s32) into %ir.p9, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE4]], [[GLOBAL_LOAD_DWORD3]], 0, 0, implicit $exec :: (store (s32) into %ir.p10, addrspace 1) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD3]], [[GLOBAL_LOAD_DWORD3]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_1:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[GLOBAL_LOAD_DWORD3]], [[V_LSHL_OR_B32_e64_22]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_LSHL_OR_B32_e64_22]], [[V_LSHL_OR_B32_e64_19]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_2:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[GLOBAL_LOAD_DWORD3]], [[V_LSHL_OR_B32_e64_19]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE10:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHL_OR_B32_e64_]], %subreg.sub0, undef %471:vgpr_32, %subreg.sub1 + ; CHECK-NEXT: %167:vreg_64, $sgpr_null = V_MAD_U64_U32_e64 [[V_MUL_LO_U32_e64_2]], [[GLOBAL_LOAD_DWORD]], [[REG_SEQUENCE10]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE11:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_SUB_U32_e64_1]], %subreg.sub0, undef %469:vgpr_32, %subreg.sub1 + ; CHECK-NEXT: %173:vreg_64, $sgpr_null = V_MAD_U64_U32_e64 %167.sub0, [[V_ADD_U32_e64_]], [[REG_SEQUENCE11]], 0, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_3:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 %173.sub0, 
[[V_MUL_LO_U32_e64_3]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_4:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 %173.sub0, [[V_SUB_U32_e64_2]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_5:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_MUL_LO_U32_e64_2]], [[V_MUL_LO_U32_e64_3]], implicit $exec + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array2, target-flags(amdgpu-gotprel32-hi) @array2, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY22:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY22]], 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array2, i64 20)`) + ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[FLAT_LOAD_DWORD]], [[V_SUB_U32_e64_2]], 0, implicit $exec + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET1:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array4, target-flags(amdgpu-gotprel32-hi) @array4, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET1]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY23:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM1]] + ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY23]], [[V_ADD_U32_e64_3]], 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr getelementptr inbounds nuw (i8, ptr @array4, i64 4)`) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE28:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8) from %ir.p7, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE29:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 1, 0, implicit $exec :: (load (s8) from %ir.p7 + 1, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_23:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE29]], 8, [[GLOBAL_LOAD_UBYTE28]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE30:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 2, 0, implicit $exec :: (load (s8) from %ir.p7 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE31:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 3, 0, implicit $exec :: (load (s8) from %ir.p7 + 3, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_24:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE31]], 8, [[GLOBAL_LOAD_UBYTE30]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_25:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_24]], 16, [[V_LSHL_OR_B32_e64_23]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE32:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 28, 0, implicit $exec :: (load (s8) from %ir.p7 + 28, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE33:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 29, 0, implicit $exec :: (load (s8) from %ir.p7 + 29, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_26:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE33]], 8, [[GLOBAL_LOAD_UBYTE32]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE34:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 30, 0, implicit $exec :: (load (s8) from %ir.p7 + 30, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE35:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 31, 0, implicit $exec :: (load (s8) from %ir.p7 + 31, 
addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_27:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE35]], 8, [[GLOBAL_LOAD_UBYTE34]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_28:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_27]], 16, [[V_LSHL_OR_B32_e64_26]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE36:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 24, 0, implicit $exec :: (load (s8) from %ir.p7 + 24, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE37:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 25, 0, implicit $exec :: (load (s8) from %ir.p7 + 25, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_29:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE37]], 8, [[GLOBAL_LOAD_UBYTE36]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE38:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 26, 0, implicit $exec :: (load (s8) from %ir.p7 + 26, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE39:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 27, 0, implicit $exec :: (load (s8) from %ir.p7 + 27, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_30:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE39]], 8, [[GLOBAL_LOAD_UBYTE38]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_31:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_30]], 16, [[V_LSHL_OR_B32_e64_29]], implicit $exec + ; CHECK-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY22]], 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array2, i64 28)`) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_6:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[FLAT_LOAD_DWORD1]], [[V_MUL_LO_U32_e64_5]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_3]], [[V_MUL_LO_U32_e64_3]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_7:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_SUB_U32_e64_2]], [[FLAT_LOAD_DWORD]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_4:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_MUL_LO_U32_e64_7]], [[V_MUL_LO_U32_e64_6]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_8:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD_U32_e64_4]], [[V_SUB_U32_e64_4]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MUL_LO_U32_e64_8]], [[V_LSHL_OR_B32_e64_31]], 0, implicit $exec + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET2:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array3, target-flags(amdgpu-gotprel32-hi) @array3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET2]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM2]] + ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY24]], [[V_ADD_U32_e64_5]], 68, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr getelementptr inbounds nuw (i8, ptr @array3, i64 68)`) + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET3:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array5, target-flags(amdgpu-gotprel32-hi) @array5, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM3:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET3]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY25:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM3]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD2:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY25]], 
20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array5, i64 20)`) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_9:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[FLAT_LOAD_DWORD2]], [[V_LSHL_OR_B32_e64_28]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MUL_LO_U32_e64_9]], [[V_SUB_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_5:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[FLAT_LOAD_DWORD]], [[V_LSHL_OR_B32_e64_28]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_10:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_SUB_U32_e64_2]], [[V_MUL_LO_U32_e64_3]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_6]], [[V_ADD_U32_e64_4]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MUL_LO_U32_e64_9]], [[V_MUL_LO_U32_e64_8]], 0, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_6:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_LSHL_OR_B32_e64_28]], [[V_SUB_U32_e64_2]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_SUB_U32_e64_5]], [[FLAT_LOAD_DWORD1]], 0, implicit $exec + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET4:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array6, target-flags(amdgpu-gotprel32-hi) @array6, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM4:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET4]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM4]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD3:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY26]], 44, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array6, i64 44)`) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_11:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[FLAT_LOAD_DWORD3]], [[V_SUB_U32_e64_5]], implicit $exec + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET5:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array7, target-flags(amdgpu-gotprel32-hi) @array7, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM5:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET5]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM5]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD4:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY27]], 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array7, i64 20)`) + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET6:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array8, target-flags(amdgpu-gotprel32-hi) @array8, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM6:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET6]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM6]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD5:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY28]], 44, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array8, i64 44)`, align 8) + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET7:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array9, target-flags(amdgpu-gotprel32-hi) @array9, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM7:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM 
[[SI_PC_ADD_REL_OFFSET7]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_SUB_U32_e64_2]], %stack.10, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5) + ; CHECK-NEXT: [[COPY29:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM7]] + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_SUB_U32_e64_1]], %stack.11, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD6:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY29]], 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array9, i64 24)`) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_12:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[FLAT_LOAD_DWORD6]], [[V_ADD_U32_e64_5]], implicit $exec + ; CHECK-NEXT: [[FLAT_LOAD_DWORD7:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY22]], 84, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array2, i64 84)`) + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[REG_SEQUENCE5]], %stack.12, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.12, align 4, addrspace 5) + ; CHECK-NEXT: [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[FLAT_LOAD_DWORD7]], [[V_MUL_LO_U32_e64_8]], 0, implicit $exec + ; CHECK-NEXT: [[FLAT_LOAD_DWORD8:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY24]], 80, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array3, i64 80)`) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD9:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY23]], 80, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array4, i64 80)`, align 8) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD10:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY25]], 88, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array5, i64 88)`) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_LSHL_OR_B32_e64_16]], %stack.13, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_13:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[FLAT_LOAD_DWORD10]], [[V_ADD_U32_e64_4]], implicit $exec + ; CHECK-NEXT: [[FLAT_LOAD_DWORD11:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY28]], 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array8, i64 20)`) + ; CHECK-NEXT: [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[FLAT_LOAD_DWORD11]], [[V_MUL_LO_U32_e64_6]], 0, implicit $exec + ; CHECK-NEXT: [[FLAT_LOAD_DWORD12:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY23]], 8, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array4, i64 8)`) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD13:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY22]], 12, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array2, i64 12)`) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD14:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY24]], 4, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array3, i64 4)`) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD15:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY25]], 4, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array5, i64 4)`) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_14:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[FLAT_LOAD_DWORD15]], [[V_MUL_LO_U32_e64_9]], implicit $exec + ; CHECK-NEXT: 
[[GLOBAL_LOAD_UBYTE40:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE9]], 16, 0, implicit $exec :: (load (s8) from %ir.p4 + 16, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE41:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE9]], 17, 0, implicit $exec :: (load (s8) from %ir.p4 + 17, addrspace 1) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_LSHL_OR_B32_e64_25]], %stack.14, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_32:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE41]], 8, [[GLOBAL_LOAD_UBYTE40]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE42:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE9]], 18, 0, implicit $exec :: (load (s8) from %ir.p4 + 18, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE43:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE9]], 19, 0, implicit $exec :: (load (s8) from %ir.p4 + 19, addrspace 1) + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[COPY26]], %stack.15, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.15, align 4, addrspace 5) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_33:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE43]], 8, [[GLOBAL_LOAD_UBYTE42]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_34:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_33]], 16, [[V_LSHL_OR_B32_e64_32]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE44:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE9]], 12, 0, implicit $exec :: (load (s8) from %ir.p4 + 12, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE45:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE9]], 13, 0, implicit $exec :: (load (s8) from %ir.p4 + 13, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_35:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE45]], 8, [[GLOBAL_LOAD_UBYTE44]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE46:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE9]], 14, 0, implicit $exec :: (load (s8) from %ir.p4 + 14, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE47:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE9]], 15, 0, implicit $exec :: (load (s8) from %ir.p4 + 15, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_36:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE47]], 8, [[GLOBAL_LOAD_UBYTE46]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_37:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_36]], 16, [[V_LSHL_OR_B32_e64_35]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_15:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_LSHL_OR_B32_e64_37]], [[V_SUB_U32_e64_5]], implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[V_LSHL_OR_B32_e64_34]], [[V_ADD_U32_e64_6]], [[V_MUL_LO_U32_e64_15]], implicit $exec + ; CHECK-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[V_MUL_LO_U32_e64_14]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_RCP_IFLAG_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_IFLAG_F32_e64 0, [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_MUL_F32_e64 0, 1333788670, 0, [[V_RCP_IFLAG_F32_e64_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_CVT_U32_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_7:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 0, [[V_MUL_LO_U32_e64_14]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_16:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 
[[V_SUB_U32_e64_7]], [[V_CVT_U32_F32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_]], [[V_MUL_LO_U32_e64_16]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_CVT_U32_F32_e64_]], [[V_MUL_HI_U32_e64_]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_ADD3_U32_e64_]], [[V_ADD_U32_e64_12]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_17:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_MUL_HI_U32_e64_1]], [[V_MUL_LO_U32_e64_14]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_8:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_ADD3_U32_e64_]], [[V_MUL_LO_U32_e64_17]], 0, implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_GE_U32_e64 [[V_SUB_U32_e64_8]], [[V_MUL_LO_U32_e64_14]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[V_MUL_HI_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MUL_HI_U32_e64_1]], 0, [[V_ADD_U32_e64_13]], [[V_CMP_GE_U32_e64_1]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_9:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_SUB_U32_e64_8]], [[V_MUL_LO_U32_e64_14]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_SUB_U32_e64_8]], 0, [[V_SUB_U32_e64_9]], [[V_CMP_GE_U32_e64_1]], implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_2:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_GE_U32_e64 [[V_CNDMASK_B32_e64_1]], [[V_MUL_LO_U32_e64_14]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[V_CNDMASK_B32_e64_]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_]], 0, [[V_ADD_U32_e64_14]], [[V_CMP_GE_U32_e64_2]], implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[FLAT_LOAD_DWORD14]], [[V_ADD_U32_e64_3]], [[V_CNDMASK_B32_e64_2]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_18:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD3_U32_e64_1]], [[FLAT_LOAD_DWORD10]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_10:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_MUL_LO_U32_e64_18]], [[FLAT_LOAD_DWORD6]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[FLAT_LOAD_DWORD13]], [[V_ADD_U32_e64_5]], [[V_SUB_U32_e64_10]], implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE12:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_SUB_U32_e64_3]], %subreg.sub0, undef %467:vgpr_32, %subreg.sub1 + ; CHECK-NEXT: %316:vreg_64, $sgpr_null = V_MAD_U64_U32_e64 [[V_ADD3_U32_e64_2]], [[V_MUL_LO_U32_e64_4]], [[REG_SEQUENCE12]], 0, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_11:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 %316.sub0, [[GLOBAL_LOAD_DWORD2]], 0, implicit $exec + ; CHECK-NEXT: [[V_CVT_F32_U32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[V_LSHL_OR_B32_e64_28]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_RCP_IFLAG_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_IFLAG_F32_e64 0, [[V_CVT_F32_U32_e64_1]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_MUL_F32_e64 0, 1333788670, 0, [[V_RCP_IFLAG_F32_e64_1]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_CVT_U32_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, [[V_MUL_F32_e64_1]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: 
[[V_SUB_U32_e64_12:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 0, [[V_LSHL_OR_B32_e64_28]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_19:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_SUB_U32_e64_12]], [[V_CVT_U32_F32_e64_1]], implicit $exec + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_1]], [[V_MUL_LO_U32_e64_19]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_CVT_U32_F32_e64_1]], [[V_MUL_HI_U32_e64_2]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_3:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_SUB_U32_e64_11]], [[V_ADD_U32_e64_15]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_20:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_MUL_HI_U32_e64_3]], [[V_LSHL_OR_B32_e64_28]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_13:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_SUB_U32_e64_11]], [[V_MUL_LO_U32_e64_20]], 0, implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_GE_U32_e64 [[V_SUB_U32_e64_13]], [[V_LSHL_OR_B32_e64_28]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[V_MUL_HI_U32_e64_3]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MUL_HI_U32_e64_3]], 0, [[V_ADD_U32_e64_16]], [[V_CMP_GE_U32_e64_3]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_14:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_SUB_U32_e64_13]], [[V_LSHL_OR_B32_e64_28]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_4:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_SUB_U32_e64_13]], 0, [[V_SUB_U32_e64_14]], [[V_CMP_GE_U32_e64_3]], implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_4:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_GE_U32_e64 [[V_CNDMASK_B32_e64_4]], [[V_LSHL_OR_B32_e64_28]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[V_CNDMASK_B32_e64_3]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_5:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_3]], 0, [[V_ADD_U32_e64_17]], [[V_CMP_GE_U32_e64_4]], implicit $exec + ; CHECK-NEXT: DS_WRITE_B8_D16_HI [[COPY19]], [[V_CNDMASK_B32_e64_5]], 2, 0, implicit $exec :: (store (s8) into %ir.p2 + 2, addrspace 3) + ; CHECK-NEXT: DS_WRITE_B8_gfx9 [[COPY19]], [[V_CNDMASK_B32_e64_5]], 0, 0, implicit $exec :: (store (s8) into %ir.p2, addrspace 3) + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 24, [[V_CNDMASK_B32_e64_5]], implicit $exec + ; CHECK-NEXT: DS_WRITE_B8_gfx9 [[COPY19]], [[V_LSHRREV_B32_e64_]], 3, 0, implicit $exec :: (store (s8) into %ir.p2 + 3, addrspace 3) + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 8, [[V_CNDMASK_B32_e64_5]], implicit $exec + ; CHECK-NEXT: DS_WRITE_B8_gfx9 [[COPY19]], [[V_LSHRREV_B32_e64_1]], 1, 0, implicit $exec :: (store (s8) into %ir.p2 + 1, addrspace 3) + ; CHECK-NEXT: [[V_SUB_U32_e64_15:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_CNDMASK_B32_e64_5]], [[FLAT_LOAD_DWORD11]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE13:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[FLAT_LOAD_DWORD5]], %subreg.sub0, undef %465:vgpr_32, %subreg.sub1 + ; CHECK-NEXT: %346:vreg_64, $sgpr_null = V_MAD_U64_U32_e64 [[V_SUB_U32_e64_15]], [[V_LSHL_OR_B32_e64_34]], [[REG_SEQUENCE13]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE14:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[FLAT_LOAD_DWORD]], %subreg.sub0, undef %463:vgpr_32, %subreg.sub1 + ; CHECK-NEXT: %352:vreg_64, $sgpr_null = V_MAD_U64_U32_e64 %346.sub0, [[FLAT_LOAD_DWORD1]], [[REG_SEQUENCE14]], 0, implicit $exec + ; 
CHECK-NEXT: [[V_SUB_U32_e64_16:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 %352.sub0, [[V_MUL_LO_U32_e64_5]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE15:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_SUB_U32_e64_16]], %subreg.sub0, undef %461:vgpr_32, %subreg.sub1 + ; CHECK-NEXT: %359:vreg_64, $sgpr_null = V_MAD_U64_U32_e64 [[FLAT_LOAD_DWORD12]], [[V_ADD_U32_e64_3]], [[REG_SEQUENCE15]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_21:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 %359.sub0, [[V_ADD_U32_e64_11]], implicit $exec + ; CHECK-NEXT: [[V_CVT_F32_U32_e64_2:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[V_MUL_LO_U32_e64_13]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_RCP_IFLAG_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_IFLAG_F32_e64 0, [[V_CVT_F32_U32_e64_2]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_F32_e64_2:%[0-9]+]]:vgpr_32 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_MUL_F32_e64 0, 1333788670, 0, [[V_RCP_IFLAG_F32_e64_2]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_CVT_U32_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, [[V_MUL_F32_e64_2]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_17:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 0, [[V_MUL_LO_U32_e64_13]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_22:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_SUB_U32_e64_17]], [[V_CVT_U32_F32_e64_2]], implicit $exec + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_4:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_2]], [[V_MUL_LO_U32_e64_22]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_CVT_U32_F32_e64_2]], [[V_MUL_HI_U32_e64_4]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_5:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_MUL_LO_U32_e64_21]], [[V_ADD_U32_e64_18]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_23:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_MUL_HI_U32_e64_5]], [[V_MUL_LO_U32_e64_13]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_18:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_MUL_LO_U32_e64_21]], [[V_MUL_LO_U32_e64_23]], 0, implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_5:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_GE_U32_e64 [[V_SUB_U32_e64_18]], [[V_MUL_LO_U32_e64_13]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[V_MUL_HI_U32_e64_5]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_6:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MUL_HI_U32_e64_5]], 0, [[V_ADD_U32_e64_19]], [[V_CMP_GE_U32_e64_5]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_19:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_SUB_U32_e64_18]], [[V_MUL_LO_U32_e64_13]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_7:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_SUB_U32_e64_18]], 0, [[V_SUB_U32_e64_19]], [[V_CMP_GE_U32_e64_5]], implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_6:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_GE_U32_e64 [[V_CNDMASK_B32_e64_7]], [[V_MUL_LO_U32_e64_13]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[V_CNDMASK_B32_e64_6]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_8:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_6]], 0, [[V_ADD_U32_e64_20]], [[V_CMP_GE_U32_e64_6]], implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[FLAT_LOAD_DWORD9]], [[V_MUL_LO_U32_e64_7]], [[V_CNDMASK_B32_e64_8]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_20:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_SUB_U32_e64_4]], [[FLAT_LOAD_DWORD8]], 0, implicit $exec + ; 
CHECK-NEXT: [[V_ADD3_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[V_ADD3_U32_e64_3]], [[V_SUB_U32_e64_20]], [[V_ADD_U32_e64_10]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_24:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD3_U32_e64_4]], [[V_MUL_LO_U32_e64_12]], implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[FLAT_LOAD_DWORD5]], [[V_MUL_LO_U32_e64_9]], [[V_MUL_LO_U32_e64_24]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_21:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_ADD_U32_e64_6]], [[FLAT_LOAD_DWORD4]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[V_ADD3_U32_e64_5]], [[V_SUB_U32_e64_21]], [[V_MUL_LO_U32_e64_11]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_25:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD3_U32_e64_6]], [[V_ADD_U32_e64_3]], implicit $exec + ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY27]], [[V_MUL_LO_U32_e64_25]], 68, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr getelementptr inbounds nuw (i8, ptr @array7, i64 68)`) + ; CHECK-NEXT: [[V_ADD3_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[FLAT_LOAD_DWORD]], [[V_ADD_U32_e64_5]], [[V_MUL_LO_U32_e64_25]], implicit $exec + ; CHECK-NEXT: [[V_CVT_F32_U32_e64_3:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[V_ADD_U32_e64_9]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_RCP_IFLAG_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_IFLAG_F32_e64 0, [[V_CVT_F32_U32_e64_3]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_F32_e64_3:%[0-9]+]]:vgpr_32 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_MUL_F32_e64 0, 1333788670, 0, [[V_RCP_IFLAG_F32_e64_3]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_CVT_U32_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, [[V_MUL_F32_e64_3]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_22:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 0, [[V_ADD_U32_e64_9]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_26:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_SUB_U32_e64_22]], [[V_CVT_U32_F32_e64_3]], implicit $exec + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_6:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_3]], [[V_MUL_LO_U32_e64_26]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_CVT_U32_F32_e64_3]], [[V_MUL_HI_U32_e64_6]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_7:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_ADD3_U32_e64_7]], [[V_ADD_U32_e64_21]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_27:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_MUL_HI_U32_e64_7]], [[V_ADD_U32_e64_9]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_23:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_ADD3_U32_e64_7]], [[V_MUL_LO_U32_e64_27]], 0, implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_7:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_GE_U32_e64 [[V_SUB_U32_e64_23]], [[V_ADD_U32_e64_9]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[V_MUL_HI_U32_e64_7]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_9:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MUL_HI_U32_e64_7]], 0, [[V_ADD_U32_e64_22]], [[V_CMP_GE_U32_e64_7]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_24:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_SUB_U32_e64_23]], [[V_ADD_U32_e64_9]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_10:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_SUB_U32_e64_23]], 0, [[V_SUB_U32_e64_24]], [[V_CMP_GE_U32_e64_7]], implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_8:%[0-9]+]]:sreg_32_xm0_xexec = 
V_CMP_GE_U32_e64 [[V_CNDMASK_B32_e64_10]], [[V_ADD_U32_e64_9]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[V_CNDMASK_B32_e64_9]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_11:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_9]], 0, [[V_ADD_U32_e64_23]], [[V_CMP_GE_U32_e64_8]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_28:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_CNDMASK_B32_e64_11]], [[V_SUB_U32_e64_6]], implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[V_MUL_LO_U32_e64_28]], [[V_ADD_U32_e64_8]], [[V_ADD_U32_e64_7]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_29:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD3_U32_e64_8]], [[V_MUL_LO_U32_e64_10]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_30:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_MUL_LO_U32_e64_29]], [[V_LSHL_OR_B32_e64_3]], implicit $exec + ; CHECK-NEXT: [[FLAT_LOAD_DWORD16:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY24]], 84, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array3, i64 84)`) + ; CHECK-NEXT: [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[FLAT_LOAD_DWORD16]], [[V_LSHL_OR_B32_e64_34]], 0, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_25:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_ADD_U32_e64_24]], [[V_MUL_LO_U32_e64_5]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE16:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MUL_LO_U32_e64_30]], %subreg.sub0, [[V_SUB_U32_e64_25]], %subreg.sub1 + ; CHECK-NEXT: FLAT_STORE_DWORDX2 [[COPY23]], [[REG_SEQUENCE16]], 76, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `ptr getelementptr inbounds nuw (i8, ptr @array4, i64 76)`, align 4) + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.15, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.15, align 4, addrspace 5) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD17:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[SI_SPILL_V64_RESTORE]], 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array6, i64 28)`) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.14, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5) + ; CHECK-NEXT: [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[FLAT_LOAD_DWORD17]], [[SI_SPILL_V32_RESTORE]], 0, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.13, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_31:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD_U32_e64_25]], [[SI_SPILL_V32_RESTORE1]], implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE1:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.12, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.12, align 4, addrspace 5) + ; CHECK-NEXT: GLOBAL_STORE_SHORT_D16_HI [[SI_SPILL_V64_RESTORE1]], [[V_MUL_LO_U32_e64_31]], 2, 0, implicit $exec :: (store (s16) into %ir.p8 + 2, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_SHORT [[SI_SPILL_V64_RESTORE1]], [[V_MUL_LO_U32_e64_31]], 0, 0, implicit $exec :: (store (s16) into %ir.p8, addrspace 1) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE2:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE3:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5) + ; 
CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.bb2: + ; CHECK-NEXT: successors: %bb.5(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_LSHL_OR_B32_e64_6]], %bb.0, [[SI_SPILL_V32_RESTORE3]], %bb.1 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[V_LSHL_OR_B32_e64_9]], %bb.0, [[FLAT_LOAD_DWORD16]], %bb.1 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[V_ADD_U32_e64_1]], %bb.0, [[SI_SPILL_V32_RESTORE2]], %bb.1 + ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE4:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) + ; CHECK-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_U32_e64 [[SI_SPILL_V32_RESTORE4]], [[PHI1]], implicit $exec + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE2:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.7, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_GT_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.Flow: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.2, %482, %bb.5 + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.2, undef %488:vgpr_32, %bb.5 + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:vreg_64 = PHI [[SI_SPILL_V64_RESTORE2]], %bb.2, undef %490:vreg_64, %bb.5 + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE5:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE6:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_38:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[SI_SPILL_V32_RESTORE6]], 8, [[SI_SPILL_V32_RESTORE5]], implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE7:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE8:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_39:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[SI_SPILL_V32_RESTORE7]], 8, [[SI_SPILL_V32_RESTORE8]], implicit $exec + ; CHECK-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE [[SI_IF1]], %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.bb7: + ; CHECK-NEXT: successors: %bb.6(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI4]], [[V_LSHL_OR_B32_e64_3]], 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[PHI5]], [[V_ADD_U32_e64_26]], 0, 0, implicit $exec :: (store (s32) into %ir.p6, addrspace 1) + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI3]], $exec_lo, implicit-def dead $scc + ; CHECK-NEXT: S_BRANCH %bb.6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5.bb8: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI2]], [[SI_SPILL_V32_RESTORE4]], 0, implicit $exec + ; CHECK-NEXT: 
[[SI_SPILL_V64_RESTORE3:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.8, align 4, addrspace 5) + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[SI_SPILL_V64_RESTORE3]], [[V_ADD_U32_e64_27]], 0, 0, implicit $exec :: (store (s32) into %ir.p7, addrspace 1) + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_GE_U32_e64_]], $exec_lo, implicit-def dead $scc + ; CHECK-NEXT: [[COPY30:%[0-9]+]]:sreg_32 = COPY [[S_AND_B32_]] + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6.Flow1: + ; CHECK-NEXT: successors: %bb.7(0x40000000), %bb.8(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[PHI3]], %bb.3, [[S_OR_B32_]], %bb.4 + ; CHECK-NEXT: SI_END_CF [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_40:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_39]], 16, [[V_LSHL_OR_B32_e64_38]], implicit $exec + ; CHECK-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32 = SI_IF [[PHI6]], %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7.bb9: + ; CHECK-NEXT: successors: %bb.8(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE9:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[SI_SPILL_V32_RESTORE9]], [[V_LSHL_OR_B32_e64_3]], 0, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE4:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[SI_SPILL_V64_RESTORE4]], [[V_ADD_U32_e64_28]], 0, 0, implicit $exec :: (store (s32) into %ir.p1, addrspace 1) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8.bb10: + ; CHECK-NEXT: SI_END_CF [[SI_IF2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHL_OR_B32_e64_40]], [[GLOBAL_LOAD_DWORD2]], 0, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE5:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[SI_SPILL_V64_RESTORE5]], [[V_ADD_U32_e64_29]], 0, 0, implicit $exec :: (store (s32) into %ir.p9, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 +entry: + %ld = load i32, ptr addrspace(1) %p4, align 2 + %ld0 = load i32, ptr addrspace(1) %p5, align 8 + %ld1 = load i32, ptr addrspace(1) %p1, align 4 + %ld2 = load i32, ptr addrspace(1) %p9, align 1 + %ld8 = load i32, ptr addrspace(1) %p8, align 2 + %ld6 = load i32, ptr addrspace(1) %p6, align 4 + %ld10 = load i32, ptr addrspace(1) %p10, align 1 + %ld11 = load i32, ptr addrspace(1) %p11, align 1 + %tmp0 = sub i32 %ld, %ld0 + store i32 %tmp0, ptr addrspace(1) %p11 + %tmp1 = add i32 %ld, %ld0 + store i32 %tmp1, ptr addrspace(1) %p1 + %ld3 = load i32, ptr addrspace(3) %p2, align 1 + %load4 = load <8 x i32>, ptr addrspace(1) %p3, align 1 + %extract11 = extractelement < 8 x i32> %load4, i32 0 + %extract12 = extractelement < 8 x i32> %load4, i32 1 + %extract13 = extractelement < 8 x i32> %load4, i32 2 + %extract14 = extractelement < 8 x i32> %load4, i32 3 + %extract15 = extractelement < 8 x i32> %load4, i32 4 + %extract16 = extractelement < 8 x i32> %load4, i32 5 + %extract17 = extractelement < 8 x i32> %load4, i32 6 + 
%extract18 = extractelement < 8 x i32> %load4, i32 7 + %tmp70 = mul i32 %ld3, %extract11 + %tmp71 = add i32 %tmp70, %extract12 + %tmp72 = sub i32 %tmp71, %ld0 + store i32 %tmp71, ptr addrspace(1) %p3 + %ld9 = load i32, ptr addrspace(1) %p9 + store i32 %ld3, ptr addrspace(1) %p7 + store i32 %tmp70, ptr addrspace(1) %p9 + %cond1 = icmp uge i32 %ld0, %arg1 + br i1 %cond1, label %bb1, label %bb2 + +bb1: + %load1 = load i32, ptr addrspace(1) %p3, align 4 + %load2 = load <8 x i32>, ptr addrspace(1) %p10, align 1 + %extract1 = extractelement < 8 x i32> %load2, i32 1 + %extract2 = extractelement < 8 x i32> %load2, i32 2 + %tmp84 = add i32 %load1, %extract1 + %tmp85 = mul i32 %tmp84, %extract2 + store i32 %tmp85, ptr addrspace(1) %p6 + store i32 %tmp84, ptr addrspace(1) %p8 + store i32 %extract1, ptr addrspace(1) %p9 + store i32 %load1, ptr addrspace(1) %p10 + %tmp101 = mul i32 %load1, %load1 + %tmp102 = sub i32 %load1, %extract1 + %tmp103 = mul i32 %extract1, %extract2 + %tmp104 = sub i32 %load1, %extract2 + %tmp73 = mul i32 %tmp101, %ld0 + %tmp74 = add i32 %tmp73, %ld + %tmp75 = mul i32 %tmp74, %tmp1 + %tmp76 = add i32 %tmp75, %tmp102 + %tmp77 = sub i32 %tmp76, %tmp103 + %tmp78 = mul i32 %tmp76, %tmp104 + %tmp2 = mul i32 %tmp101, %tmp103 + %idx10 = getelementptr inbounds [5 x i32], [5 x i32]* @array2, i64 1, i64 0 + %val0 = load i32, i32* %idx10, align 4 + %tmp3 = add i32 %val0, %tmp104 + %idx20 = getelementptr inbounds [5 x i32], [5 x i32]* @array4, i64 0, i64 1 + store i32 %tmp3, i32 *%idx20 + %load22 = load <8 x i32>, ptr addrspace(1) %p7, align 1 + %extract3 = extractelement < 8 x i32> %load22, i32 6 + %idx12 = getelementptr inbounds [5 x i32], [5 x i32]* @array2, i64 1, i64 2 + %val2 = load i32, i32* %idx12, align 4 + %tmp4 = mul i32 %val2, %tmp2 + %tmp5= add i32 %tmp3, %tmp103 + %tmp6 = mul i32 %tmp104, %val0 + %tmp7 = sub i32 %tmp6, %tmp4 + %tmp8 = mul i32 %tmp5, %tmp7 + %tmp9 = add i32 %tmp8, %extract3 + %idx22 = getelementptr inbounds [5 x i32], [5 x i32]* @array3, i64 3, i64 2 + store i32 %tmp9, i32 *%idx22 + %extract4 = extractelement < 8 x i32> %load22, i32 7 + %idx13 = getelementptr inbounds [5 x i32], [5 x i32]* @array5, i64 1, i64 0 + %val3 = load i32, i32* %idx13, align 4 + %tmp10 = mul i32 %val3, %extract4 + %tmp11 = add i32 %tmp10, %tmp102 + %tmp12 = sub i32 %val0, %extract4 + %tmp13 = mul i32 %tmp104, %tmp103 + %tmp14 = add i32 %tmp11, %tmp5 + %tmp15 = add i32 %tmp10, %tmp8 + %tmp16 = sub i32 %extract4, %tmp104 + %tmp17 = add i32 %tmp12, %val2 + %tmp18 = add i32 %val0, %tmp9 + %idx601 = getelementptr inbounds [5 x i32], [5 x i32]* @array6, i64 2, i64 1 + %val601 = load i32, i32* %idx601, align 1 + %tmp19 = mul i32 %val601, %tmp12 + %idx701 = getelementptr inbounds [5 x i32], [5 x i32]* @array7, i64 1, i64 0 + %val701 = load i32, i32* %idx701, align 2 + %tmp20 = sub i32 %val701, %tmp11 + %idx801 = getelementptr inbounds [5 x i32], [5 x i32]* @array8, i64 2, i64 1 + %val801 = load i32, i32* %idx801, align 8 + %tmp21 = add i32 %val801, %tmp10 + %idx901 = getelementptr inbounds [5 x i32], [5 x i32]* @array9, i64 1, i64 1 + %val901 = load i32, i32* %idx901, align 1 + %tmp22 = mul i32 %val901, %tmp9 + %idx602 = getelementptr inbounds [5 x i32], [5 x i32]* @array2, i64 4, i64 1 + %val602 = load i32, i32* %idx602, align 1 + %tmp23 = add i32 %val602, %tmp8 + %idx702 = getelementptr inbounds [5 x i32], [5 x i32]* @array3, i64 4, i64 0 + %val702 = load i32, i32* %idx702, align 2 + %tmp24 = sub i32 %val702, %tmp7 + %idx802 = getelementptr inbounds [5 x i32], [5 x i32]* @array4, i64 4, i64 
0 + %val802 = load i32, i32* %idx802, align 8 + %tmp25 = add i32 %val802, %tmp6 + %idx902 = getelementptr inbounds [5 x i32], [5 x i32]* @array5, i64 4, i64 2 + %val902 = load i32, i32* %idx902, align 1 + %tmp26 = mul i32 %val902, %tmp5 + %idx800 = getelementptr inbounds [5 x i32], [5 x i32]* @array8, i64 1, i64 0 + %val800 = load i32, i32* %idx800, align 4 + %tmp27 = add i32 %val800, %tmp4 + %idx15 = getelementptr inbounds [5 x i32], [5 x i32]* @array4, i64 0, i64 2 + %val5 = load i32, i32* %idx15, align 4 + %tmp28 = mul i32 %val5, %tmp3 + %idx16 = getelementptr inbounds [5 x i32], [5 x i32]* @array2, i64 0, i64 3 + %val6 = load i32, i32* %idx16, align 4 + %tmp206 = add i32 %val6, %tmp9 + %idx17 = getelementptr inbounds [5 x i32], [5 x i32]* @array3, i64 0, i64 1 + %val7 = load i32, i32* %idx17, align 4 + %tmp207 = add i32 %val7, %tmp3 + %idx18 = getelementptr inbounds [5 x i32], [5 x i32]* @array5, i64 0, i64 1 + %val8 = load i32, i32* %idx18, align 4 + %tmp208 = mul i32 %val8, %tmp10 + %load3 = load <8 x i32>, ptr addrspace(1) %p4, align 1 + %extract7 = extractelement < 8 x i32> %load3, i32 4 + %tmp209 = add i32 %extract7, %tmp11 + %extract8 = extractelement < 8 x i32> %load3, i32 3 + %tmp30 = mul i32 %extract8, %tmp12 + %tmp31 = add i32 %tmp30, %tmp209 + %tmp32 = udiv i32 %tmp31, %tmp208 + %tmp33 = add i32 %tmp32, %tmp207 + %tmp34 = mul i32 %tmp33, %val902 + %tmp35 = sub i32 %tmp34, %val901 + %tmp36 = add i32 %tmp35, %tmp206 + %tmp37 = mul i32 %tmp36, %tmp78 + %tmp38 = add i32 %tmp37, %tmp77 + %tmp39 = sub i32 %tmp38, %ld9 + %tmp40 = udiv i32 %tmp39, %extract4 + store i32 %tmp40, ptr addrspace(3) %p2, align 1 + %tmp41 = sub i32 %tmp40, %val800 + %tmp42 = mul i32 %tmp41, %extract7 + %tmp43 = add i32 %tmp42, %val801 + %tmp44 = mul i32 %tmp43, %val2 + %tmp45 = add i32 %tmp44, %val0 + %tmp46 = sub i32 %tmp45, %tmp2 + %tmp47 = add i32 %tmp46, %tmp28 + %tmp48 = mul i32 %tmp47, %tmp27 + %tmp49 = udiv i32 %tmp48, %tmp26 + %tmp50 = add i32 %tmp49, %tmp25 + %tmp51 = sub i32 %tmp50, %tmp24 + %tmp52 = add i32 %tmp51, %tmp23 + %tmp53 = mul i32 %tmp52, %tmp22 + %tmp54 = add i32 %tmp53, %tmp21 + %tmp55 = sub i32 %tmp54, %tmp20 + %tmp56 = add i32 %tmp55, %tmp19 + %tmp57 = mul i32 %tmp56, %tmp3 + %idx700 = getelementptr inbounds [5 x i32], [5 x i32]* @array7, i64 3, i64 2 + store i32 %tmp57, i32 *%idx700 + %tmp58 = add i32 %tmp57, %tmp18 + %tmp59 = udiv i32 %tmp58, %tmp17 + %tmp60 = mul i32 %tmp59, %tmp16 + %tmp61 = add i32 %tmp60, %tmp15 + %tmp62 = add i32 %tmp61, %tmp14 + %tmp63 = mul i32 %tmp62, %tmp13 + %tmp64 = mul i32 %tmp63, %ld2 + %idx23 = getelementptr inbounds [5 x i32], [5 x i32]* @array4, i64 3, i64 4 + store i32 %tmp64, i32 *%idx23 + %extract27 = extractelement < 8 x i32> %load3, i32 4 + %idx14 = getelementptr inbounds [5 x i32], [5 x i32]* @array3, i64 4, i64 1 + %val4 = load i32, i32* %idx14, align 4 + %tmp65 = add i32 %val4, %extract27 + %tmp66 = sub i32 %tmp65, %tmp2 + %idx24 = getelementptr inbounds [5 x i32], [5 x i32]* @array4, i64 3, i64 5 + store i32 %tmp66, i32 *%idx24 + %extract9 = extractelement < 8 x i32> %load22, i32 0 + %idx600 = getelementptr inbounds [5 x i32], [5 x i32]* @array6, i64 1, i64 2 + %val600 = load i32, i32* %idx600, align 4 + %tmp67 = add i32 %val600, %extract9 + %extract10 = extractelement < 8 x i32> %load22, i32 5 + %tmp68 = sub i32 %extract10, %tmp3 + %tmp69 = add i32 %ld10, %ld6 + %tmp79 = mul i32 %tmp67, %ld8 + store i32 %tmp79, ptr addrspace(1) %p8, align 2 + br label %bb2 + +bb2: + %phi1 = phi i32 [ %load1, %bb1 ], [ %tmp72, %entry ] + %phi2 = phi i32 [ 
%tmp102, %bb1], [ %ld3, %entry ] + %phi3 = phi i32 [ %val4, %bb1 ], [ %extract13, %entry ] + %phi4 = phi i32 [ %tmp104, %bb1 ], [ %tmp71, %entry ] + %tmp105 = add i32 %phi1, %phi2 + %tmp106 = add i32 %ld8, %phi4 + %tmp107 = mul i32 %tmp105, %tmp106 + %tmp108 = sub i32 %tmp107, %ld6 + %tmp80 = mul i32 %tmp108, %ld2 + %cond3 = icmp ule i32 %ld, %phi3 + br i1 %cond3, label %bb7, label %bb8 + +bb7: + %tmp81 = add i32 %phi2, %ld2 + store i32 %tmp81, ptr addrspace(1) %p6 + br label %bb9 + +bb8: + %tmp82 = add i32 %phi4, %ld + store i32 %tmp82, ptr addrspace(1) %p7 + %xor = xor i1 %cond1, %cond3 + br i1 %xor, label %bb9, label %bb10 + +bb9: + %phi5 = phi i32 [ %tmp81, %bb7], [%tmp82, %bb8] + %tmp83 = add i32 %ld1, %ld2 + store i32 %tmp83, ptr addrspace(1) %p1 + br label %bb10 + +bb10: + %tmp90 = add i32 %ld10, %ld9 + store i32 %tmp90, ptr addrspace(1) %p9, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/test_ers_multiple_spills3.ll b/llvm/test/CodeGen/AMDGPU/test_ers_multiple_spills3.ll new file mode 100644 index 0000000000000..3e75d06805c97 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/test_ers_multiple_spills3.ll @@ -0,0 +1,841 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=amdgpu-early-register-spilling -verify-machineinstrs -print-after=amdgpu-early-register-spilling -max-vgprs=30 < %s 2>&1 | FileCheck %s + +@array2 = global [5 x i32] zeroinitializer, align 4 +@array3 = global [5 x i32] zeroinitializer, align 4 +@array4 = global [5 x i32] zeroinitializer, align 4 +@array5 = global [5 x i32] zeroinitializer, align 4 + +@array6 = global [5 x i32] zeroinitializer, align 4 +@array7 = global [5 x i32] zeroinitializer, align 4 +@array8 = global [5 x i32] zeroinitializer, align 4 +@array9 = global [5 x i32] zeroinitializer, align 4 + +; bb.0.entry +; / | +; bb.1.bb1 | +; \ | +; bb.2.bb2 +; | \ +; | bb.3.bb4.preheader +; | | +; | bb.19<-+ +; | +----+ +; | | +; | bb.20.bb14.loopexit +; | / +; bb.18.Flow17 +; / | +; bb.4.bb3 | +; / | | +; bb.10.bb7 | | +; \ | | +; bb.5.Flow16 | +; / | | +; bb.6.bb6 | | +; / | | | +; bb.9.bb9 | | | +; \ | | | +; bb.7.Flow14 | | +; / | | | +; bb.8.bb8 | | | +; \ | | | +; bb.11.Flow15 | | +; \ | | +; bb.13.bb10 | +; / | | +; bb.16.bb12 | | +; \ | | +; bb.14.Flow | +; / | | +; bb.15.bb11 | | +; \ | | +; bb.17.bb13 | +; \ | +; bb.12.Flow18 +; | +; bb.21.bb14 +; +define amdgpu_ps void @test15(ptr addrspace(1) %p1, ptr addrspace(3) %p2, i1 %cond1, i1 %cond2, ptr addrspace(1) %p3, ptr addrspace(1) %p4, ptr addrspace(1) %p5, ptr addrspace(1) %p6, ptr addrspace(1) %p7, ptr addrspace(1) %p8, ptr addrspace(1) %p9, i32 %TC1) { + ; CHECK-LABEL: name: test15 + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr19 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr18 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr17 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr13 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr12 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; 
CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY13]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY19]], %subreg.sub0, [[COPY18]], %subreg.sub1 + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY16]], implicit $exec + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 1, [[V_AND_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY15]], implicit $exec + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 1, [[V_AND_B32_e64_1]], implicit $exec + ; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_1]], -1, implicit-def dead $scc + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8) from %ir.p1, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 1, 0, implicit $exec :: (load (s8) from %ir.p1 + 1, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE1]], 8, [[GLOBAL_LOAD_UBYTE]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE2:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 2, 0, implicit $exec :: (load (s8) from %ir.p1 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE3:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 3, 0, implicit $exec :: (load (s8) from %ir.p1 + 3, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE3]], 8, [[GLOBAL_LOAD_UBYTE2]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_1]], 16, [[V_LSHL_OR_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE4:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 12, 0, implicit $exec :: (load (s8) from %ir.gep1, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE5:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 13, 0, implicit $exec :: (load (s8) from %ir.gep1 + 1, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE6:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 14, 0, implicit $exec :: (load (s8) from %ir.gep1 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE7:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 15, 0, implicit $exec :: (load (s8) from %ir.gep1 + 3, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE]], 16, 0, implicit $exec :: (load (s128) from %ir.p3 + 16, align 4, addrspace 1) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY4]], %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_1:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s128) from %ir.p3, align 4, addrspace 
1) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY3]], %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[GLOBAL_LOAD_DWORDX4_1]].sub0, [[V_LSHL_OR_B32_e64_2]], 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], [[V_ADD_U32_e64_]], 0, 0, implicit $exec :: (store (s32) into %ir.p3, addrspace 1) + ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[V_LSHL_OR_B32_e64_2]], 0, implicit $exec + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.bb1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 100, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.bb2: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.18(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_ADD_U32_e64_1]], %bb.0, [[V_MOV_B32_e32_]], %bb.1 + ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[S_XOR_B32_]], %bb.18, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.bb4.preheader: + ; CHECK-NEXT: successors: %bb.19(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array6, target-flags(amdgpu-gotprel32-hi) @array6, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY20]], 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array6, i64 28)`) + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET1:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array8, target-flags(amdgpu-gotprel32-hi) @array8, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET1]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY21:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM1]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY21]], 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array8, i64 20)`) + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: S_BRANCH %bb.19 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.bb3: + ; CHECK-NEXT: successors: %bb.10(0x40000000), %bb.5(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_3:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 %523, 8, %521, implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_4:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 %527, 8, %525, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE %513, %subreg.sub0, %515, %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE %505, %subreg.sub0, %507, 
%subreg.sub1 + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_5:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_4]], 16, [[V_LSHL_OR_B32_e64_3]], implicit $exec + ; CHECK-NEXT: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 %503, 0, 0, implicit $exec :: (load (s8) from %ir.p2, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 %503, 1, 0, implicit $exec :: (load (s8) from %ir.p2 + 1, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 %503, 2, 0, implicit $exec :: (load (s8) from %ir.p2 + 2, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 %503, 3, 0, implicit $exec :: (load (s8) from %ir.p2 + 3, addrspace 3) + ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 %503, 12, 0, implicit $exec :: (load (s32) from %ir.gep2, align 8, addrspace 3) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_6:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U8_gfx9_1]], 8, [[DS_READ_U8_gfx9_]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_7:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U8_gfx9_3]], 8, [[DS_READ_U8_gfx9_2]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_8:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_7]], 16, [[V_LSHL_OR_B32_e64_6]], implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.13, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5) + ; CHECK-NEXT: %231:vreg_64, $sgpr_null = V_MAD_U64_U32_e64 [[SI_SPILL_V32_RESTORE]], [[PHI]], 1900, 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[DS_READ_B32_gfx9_]], %231.sub0, 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE8:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 20, 0, implicit $exec :: (load (s8) from %ir.p4 + 20, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE9:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 21, 0, implicit $exec :: (load (s8) from %ir.p4 + 21, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_9:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE9]], 8, [[GLOBAL_LOAD_UBYTE8]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE10:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 22, 0, implicit $exec :: (load (s8) from %ir.p4 + 22, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE11:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 23, 0, implicit $exec :: (load (s8) from %ir.p4 + 23, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_10:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE11]], 8, [[GLOBAL_LOAD_UBYTE10]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE12:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 12, 0, implicit $exec :: (load (s8) from %ir.p4 + 12, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE13:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 13, 0, implicit $exec :: (load (s8) from %ir.p4 + 13, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_11:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE13]], 8, [[GLOBAL_LOAD_UBYTE12]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE14:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 14, 0, implicit $exec :: (load (s8) from %ir.p4 + 14, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE15:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 15, 0, implicit $exec :: (load (s8) from %ir.p4 + 15, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_12:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE15]], 8, 
[[GLOBAL_LOAD_UBYTE14]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE16:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 8, 0, implicit $exec :: (load (s8) from %ir.p4 + 8, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE17:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 9, 0, implicit $exec :: (load (s8) from %ir.p4 + 9, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_13:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE17]], 8, [[GLOBAL_LOAD_UBYTE16]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE18:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 10, 0, implicit $exec :: (load (s8) from %ir.p4 + 10, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE19:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 11, 0, implicit $exec :: (load (s8) from %ir.p4 + 11, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_14:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE19]], 8, [[GLOBAL_LOAD_UBYTE18]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE20:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 4, 0, implicit $exec :: (load (s8) from %ir.p4 + 4, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE21:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 5, 0, implicit $exec :: (load (s8) from %ir.p4 + 5, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_15:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE21]], 8, [[GLOBAL_LOAD_UBYTE20]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE22:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 6, 0, implicit $exec :: (load (s8) from %ir.p4 + 6, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE23:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 7, 0, implicit $exec :: (load (s8) from %ir.p4 + 7, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_16:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE23]], 8, [[GLOBAL_LOAD_UBYTE22]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (load (s16) from %ir.p4, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[REG_SEQUENCE3]], 2, 0, implicit $exec :: (load (s16) from %ir.p4 + 2, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_17:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_USHORT1]], 16, [[GLOBAL_LOAD_USHORT]], implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.12, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5) + ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHL_OR_B32_e64_17]], [[SI_SPILL_V32_RESTORE1]], 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE3]], [[V_ADD_U32_e64_3]], 0, 0, implicit $exec :: (store (s32) into %ir.p4, addrspace 1) + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; CHECK-NEXT: [[COPY22:%[0-9]+]]:sreg_32_xexec_hi = COPY $sgpr32 + ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY22]], 1024, implicit-def dead $scc + ; CHECK-NEXT: $sgpr32 = COPY [[S_ADD_I32_]] + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (load (s64) from %ir.p6, align 4, addrspace 1) + ; CHECK-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub0 + ; CHECK-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub1 + 
; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nsw V_LSHLREV_B32_e64 2, [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SVS [[PHI]], [[V_LSHLREV_B32_e64_]], [[COPY22]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.arrayidx11, addrspace 5) + ; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_LSHLREV_B32_e64 2, [[GLOBAL_LOAD_DWORDX2_]].sub1, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_SHORT_SVS [[V_LSHL_OR_B32_e64_7]], [[V_LSHLREV_B32_e64_1]], [[COPY22]], 2, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %ir.arrayidx33 + 2, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_SHORT_SVS [[V_LSHL_OR_B32_e64_6]], [[V_LSHLREV_B32_e64_1]], [[COPY22]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %ir.arrayidx33, addrspace 5) + ; CHECK-NEXT: [[S_XOR_B32_1:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_XOR_B32_2:%[0-9]+]]:sreg_32 = S_XOR_B32 [[S_XOR_B32_1]], -1, implicit-def dead $scc + ; CHECK-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32 = SI_IF [[S_XOR_B32_2]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5.Flow18: + ; CHECK-NEXT: successors: %bb.6(0x40000000), %bb.13(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %222:vgpr_32, %bb.4, %27, %bb.10 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[V_LSHL_OR_B32_e64_5]], %bb.4, undef %536:vgpr_32, %bb.10 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[V_ADD_U32_e64_2]], %bb.4, undef %538:vgpr_32, %bb.10 + ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.10, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE1:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.11, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[DS_READ_B32_gfx9_]], %stack.16, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[REG_SEQUENCE3]], %stack.17, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.17, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V64_SAVE %231, %stack.18, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.18, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[REG_SEQUENCE2]], %stack.20, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.20, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_LSHL_OR_B32_e64_17]], %stack.21, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.21, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY23]], %stack.22, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.22, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY24]], %stack.23, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.23, addrspace 5) + ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_256 = REG_SEQUENCE undef [[SI_SPILL_V128_RESTORE]].sub0, %subreg.sub0, undef [[SI_SPILL_V128_RESTORE]].sub1, %subreg.sub1, undef [[SI_SPILL_V128_RESTORE]].sub2, %subreg.sub2, [[SI_SPILL_V128_RESTORE]].sub3, %subreg.sub3, undef [[SI_SPILL_V128_RESTORE1]].sub0, %subreg.sub4, [[SI_SPILL_V128_RESTORE1]].sub1, %subreg.sub5, undef [[SI_SPILL_V128_RESTORE1]].sub2, %subreg.sub6, undef [[SI_SPILL_V128_RESTORE1]].sub3, %subreg.sub7 + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_18:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_10]], 16, 
[[V_LSHL_OR_B32_e64_9]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_19:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_12]], 16, [[V_LSHL_OR_B32_e64_11]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_20:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_14]], 16, [[V_LSHL_OR_B32_e64_13]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_21:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_16]], 16, [[V_LSHL_OR_B32_e64_15]], implicit $exec + ; CHECK-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE [[SI_IF2]], %bb.13, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6.bb6: + ; CHECK-NEXT: successors: %bb.9(0x40000000), %bb.7(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_XOR_B32_3:%[0-9]+]]:sreg_32 = S_XOR_B32 [[S_AND_B32_]], -1, implicit-def dead $scc + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET2:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array2, target-flags(amdgpu-gotprel32-hi) @array2, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET2]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY25:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM2]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD2:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY25]], 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array2, i64 20)`) + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET3:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array4, target-flags(amdgpu-gotprel32-hi) @array4, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM3:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET3]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM3]] + ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY26]], [[FLAT_LOAD_DWORD2]], 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr getelementptr inbounds nuw (i8, ptr @array4, i64 4)`) + ; CHECK-NEXT: [[SI_IF3:%[0-9]+]]:sreg_32 = SI_IF [[S_XOR_B32_3]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7.Flow16: + ; CHECK-NEXT: successors: %bb.8(0x40000000), %bb.11(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI undef %329:vgpr_32, %bb.6, %26, %bb.9 + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI2]], %bb.6, undef %540:vgpr_32, %bb.9 + ; CHECK-NEXT: [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.6, undef %542:vgpr_32, %bb.9 + ; CHECK-NEXT: [[SI_ELSE1:%[0-9]+]]:sreg_32 = SI_ELSE [[SI_IF3]], %bb.11, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8.bb8: + ; CHECK-NEXT: successors: %bb.11(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI5]], [[PHI6]], 0, implicit $exec + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET4:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array2, target-flags(amdgpu-gotprel32-hi) @array2, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM4:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM 
[[SI_PC_ADD_REL_OFFSET4]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM4]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD3:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY27]], 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array2, i64 28)`) + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET5:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array3, target-flags(amdgpu-gotprel32-hi) @array3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM5:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET5]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM5]] + ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY28]], [[FLAT_LOAD_DWORD3]], 68, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr getelementptr inbounds nuw (i8, ptr @array3, i64 68)`) + ; CHECK-NEXT: S_BRANCH %bb.11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.9.bb9: + ; CHECK-NEXT: successors: %bb.7(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[PHI2]], [[PHI3]], implicit $exec + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET6:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array5, target-flags(amdgpu-gotprel32-hi) @array5, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM6:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET6]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY29:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM6]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD4:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY29]], 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array5, i64 20)`) + ; CHECK-NEXT: [[COPY30:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM3]] + ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY30]], [[FLAT_LOAD_DWORD4]], 60, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr getelementptr inbounds nuw (i8, ptr @array4, i64 60)`) + ; CHECK-NEXT: S_BRANCH %bb.7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.10.bb7: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_SUB_U32_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[DS_READ_B32_gfx9_]], [[V_ADD_U32_e64_2]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_SUB_U32_e64_]], [[V_LSHL_OR_B32_e64_8]], implicit $exec + ; CHECK-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[SI_SPILL_V32_RESTORE]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_RCP_IFLAG_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_IFLAG_F32_e64 0, [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nnan ninf nsz arcp contract afn reassoc nofpexcept V_MUL_F32_e64 0, 1333788670, 0, [[V_RCP_IFLAG_F32_e64_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_CVT_U32_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE2:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.13, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5) + ; CHECK-NEXT: [[V_SUB_U32_e64_1:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 0, [[SI_SPILL_V32_RESTORE2]], 0, implicit $exec + ; CHECK-NEXT: 
[[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_SUB_U32_e64_1]], [[V_CVT_U32_F32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_CVT_U32_F32_e64_]], [[V_MUL_LO_U32_e64_2]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_CVT_U32_F32_e64_]], [[V_MUL_HI_U32_e64_]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[V_MUL_LO_U32_e64_1]], [[V_ADD_U32_e64_5]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_MUL_HI_U32_e64_1]], [[SI_SPILL_V32_RESTORE2]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_2:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_MUL_LO_U32_e64_1]], [[V_MUL_LO_U32_e64_3]], 0, implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_GE_U32_e64 [[V_SUB_U32_e64_2]], [[SI_SPILL_V32_RESTORE2]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[V_MUL_HI_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MUL_HI_U32_e64_1]], 0, [[V_ADD_U32_e64_6]], [[V_CMP_GE_U32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_3:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_SUB_U32_e64_2]], [[SI_SPILL_V32_RESTORE2]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_SUB_U32_e64_2]], 0, [[V_SUB_U32_e64_3]], [[V_CMP_GE_U32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_GE_U32_e64 [[V_CNDMASK_B32_e64_1]], [[SI_SPILL_V32_RESTORE2]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 1, [[V_CNDMASK_B32_e64_]], 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_]], 0, [[V_ADD_U32_e64_7]], [[V_CMP_GE_U32_e64_1]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_CNDMASK_B32_e64_2]], [[V_LSHL_OR_B32_e64_5]], 0, implicit $exec + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET7:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array3, target-flags(amdgpu-gotprel32-hi) @array3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM7:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET7]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY31:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM7]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD5:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY31]], 84, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array3, i64 84)`) + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET8:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array4, target-flags(amdgpu-gotprel32-hi) @array4, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM8:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET8]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY32:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM8]] + ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY32]], [[FLAT_LOAD_DWORD5]], 60, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr getelementptr inbounds nuw (i8, ptr @array4, i64 60)`) + ; CHECK-NEXT: S_BRANCH %bb.5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.11.Flow17: + ; CHECK-NEXT: successors: %bb.13(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: 
[[PHI7:%[0-9]+]]:vgpr_32 = PHI [[PHI4]], %bb.7, [[V_ADD_U32_e64_4]], %bb.8 + ; CHECK-NEXT: SI_END_CF [[SI_ELSE1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.12.Flow20: + ; CHECK-NEXT: successors: %bb.21(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI %69, %bb.18, %65, %bb.17 + ; CHECK-NEXT: [[PHI9:%[0-9]+]]:vgpr_32 = PHI %70, %bb.18, %65, %bb.17 + ; CHECK-NEXT: [[PHI10:%[0-9]+]]:vgpr_32 = PHI %71, %bb.18, %68, %bb.17 + ; CHECK-NEXT: [[PHI11:%[0-9]+]]:vgpr_32 = PHI %72, %bb.18, %67, %bb.17 + ; CHECK-NEXT: [[PHI12:%[0-9]+]]:vgpr_32 = PHI %73, %bb.18, %64, %bb.17 + ; CHECK-NEXT: [[PHI13:%[0-9]+]]:vgpr_32 = PHI %74, %bb.18, %66, %bb.17 + ; CHECK-NEXT: [[PHI14:%[0-9]+]]:vgpr_32 = PHI %75, %bb.18, %65, %bb.17 + ; CHECK-NEXT: [[PHI15:%[0-9]+]]:vgpr_32 = PHI %76, %bb.18, %63, %bb.17 + ; CHECK-NEXT: SI_END_CF %77, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE3:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.14, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE4:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.15, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5) + ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[SI_SPILL_V32_RESTORE4]], %subreg.sub0, [[SI_SPILL_V32_RESTORE3]], %subreg.sub1 + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE5:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE6:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; CHECK-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[SI_SPILL_V32_RESTORE5]], %subreg.sub0, [[SI_SPILL_V32_RESTORE6]], %subreg.sub1 + ; CHECK-NEXT: S_BRANCH %bb.21 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.13.bb10: + ; CHECK-NEXT: successors: %bb.16(0x40000000), %bb.14(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI16:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.5, [[PHI7]], %bb.11 + ; CHECK-NEXT: SI_END_CF [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 100, [[V_LSHL_OR_B32_e64_8]], 0, implicit $exec + ; CHECK-NEXT: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[SI_SPILL_V128_RESTORE]].sub1 + ; CHECK-NEXT: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[SI_SPILL_V128_RESTORE]].sub2 + ; CHECK-NEXT: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[SI_SPILL_V128_RESTORE1]].sub2 + ; CHECK-NEXT: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[SI_SPILL_V128_RESTORE1]].sub3 + ; CHECK-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[SI_SPILL_V128_RESTORE]].sub1, [[SI_SPILL_V128_RESTORE1]].sub3, [[SI_SPILL_V128_RESTORE1]].sub2, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[SI_SPILL_V128_RESTORE]].sub2, %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; CHECK-NEXT: [[V_LSHLREV_B64_pseudo_e64_:%[0-9]+]]:vreg_64 = nsw V_LSHLREV_B64_pseudo_e64 2, [[REG_SEQUENCE7]], implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.9, align 4, addrspace 5) + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, 
[[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[SI_SPILL_V64_RESTORE]].sub0, [[V_LSHLREV_B64_pseudo_e64_]].sub0, 0, implicit $exec + ; CHECK-NEXT: %466:vgpr_32, dead $sgpr_null = V_ADDC_U32_e64 [[SI_SPILL_V64_RESTORE]].sub1, [[V_LSHLREV_B64_pseudo_e64_]].sub1, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE8:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %466, %subreg.sub1 + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE8]], [[V_ADD3_U32_e64_]], 0, 0, implicit $exec :: (store (s32) into %ir.arrayidx1, addrspace 1) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE7:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE8:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE1:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.18, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.18, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_IF4:%[0-9]+]]:sreg_32 = SI_IF [[S_XOR_B32_2]], %bb.14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.16 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.14.Flow: + ; CHECK-NEXT: successors: %bb.15(0x40000000), %bb.17(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI17:%[0-9]+]]:vgpr_32 = PHI undef %356:vgpr_32, %bb.13, %51, %bb.16 + ; CHECK-NEXT: [[PHI18:%[0-9]+]]:vgpr_32 = PHI undef %356:vgpr_32, %bb.13, %50, %bb.16 + ; CHECK-NEXT: [[PHI19:%[0-9]+]]:vgpr_32 = PHI undef %356:vgpr_32, %bb.13, %52, %bb.16 + ; CHECK-NEXT: [[PHI20:%[0-9]+]]:vgpr_32 = PHI [[V_LSHL_OR_B32_e64_19]], %bb.13, undef %544:vgpr_32, %bb.16 + ; CHECK-NEXT: [[PHI21:%[0-9]+]]:vgpr_32 = PHI [[V_LSHL_OR_B32_e64_18]], %bb.13, undef %546:vgpr_32, %bb.16 + ; CHECK-NEXT: [[PHI22:%[0-9]+]]:vgpr_32 = PHI [[SI_SPILL_V32_RESTORE8]], %bb.13, undef %548:vgpr_32, %bb.16 + ; CHECK-NEXT: [[PHI23:%[0-9]+]]:vgpr_32 = PHI [[SI_SPILL_V32_RESTORE7]], %bb.13, undef %550:vgpr_32, %bb.16 + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE9:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE10:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5) + ; CHECK-NEXT: [[REG_SEQUENCE9:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[SI_SPILL_V32_RESTORE9]], %subreg.sub0, [[SI_SPILL_V32_RESTORE10]], %subreg.sub1 + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE11:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.21, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.21, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE12:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.22, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.22, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE13:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.23, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.23, addrspace 5) + ; CHECK-NEXT: [[SI_ELSE2:%[0-9]+]]:sreg_32 = SI_ELSE [[SI_IF4]], %bb.17, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.15 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.15.bb11: + ; CHECK-NEXT: successors: %bb.17(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI20]], [[PHI16]], 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; 
CHECK-NEXT: [[REG_SEQUENCE10:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[PHI21]], %subreg.sub0, [[V_MOV_B32_e32_2]], %subreg.sub1 + ; CHECK-NEXT: [[V_LSHLREV_B64_pseudo_e64_1:%[0-9]+]]:vreg_64 = nsw V_LSHLREV_B64_pseudo_e64 2, [[REG_SEQUENCE10]], implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[PHI22]], [[V_LSHLREV_B64_pseudo_e64_1]].sub0, 0, implicit $exec + ; CHECK-NEXT: %474:vgpr_32, dead $sgpr_null = V_ADDC_U32_e64 [[PHI23]], [[V_LSHLREV_B64_pseudo_e64_1]].sub1, [[V_ADD_CO_U32_e64_3]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE11:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, %474, %subreg.sub1 + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE11]], [[V_ADD_U32_e64_10]], 0, 0, implicit $exec :: (store (s32) into %ir.arrayidx2, align 8, addrspace 1) + ; CHECK-NEXT: S_BRANCH %bb.17 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.16.bb12: + ; CHECK-NEXT: successors: %bb.14(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_ADD3_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[V_ADD_U32_e64_9]], [[PHI16]], [[REG_SEQUENCE4]].sub3, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE12:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[REG_SEQUENCE4]].sub5, %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; CHECK-NEXT: [[V_LSHLREV_B64_pseudo_e64_2:%[0-9]+]]:vreg_64 = nsw V_LSHLREV_B64_pseudo_e64 2, [[REG_SEQUENCE12]], implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_4:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_5:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[SI_SPILL_V32_RESTORE8]], [[V_LSHLREV_B64_pseudo_e64_2]].sub0, 0, implicit $exec + ; CHECK-NEXT: %482:vgpr_32, dead $sgpr_null = V_ADDC_U32_e64 [[SI_SPILL_V32_RESTORE7]], [[V_LSHLREV_B64_pseudo_e64_2]].sub1, [[V_ADD_CO_U32_e64_5]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE13:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_4]], %subreg.sub0, %482, %subreg.sub1 + ; CHECK-NEXT: GLOBAL_STORE_SHORT_D16_HI [[REG_SEQUENCE13]], [[V_ADD3_U32_e64_1]], 2, 0, implicit $exec :: (store (s16) into %ir.arrayidx3 + 2, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE13]], [[V_ADD3_U32_e64_1]], 0, 0, implicit $exec :: (store (s16) into %ir.arrayidx3, addrspace 1) + ; CHECK-NEXT: [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHL_OR_B32_e64_21]], [[V_ADD3_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE14:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHL_OR_B32_e64_20]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; CHECK-NEXT: [[V_LSHLREV_B64_pseudo_e64_3:%[0-9]+]]:vreg_64 = nsw V_LSHLREV_B64_pseudo_e64 2, [[REG_SEQUENCE14]], implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_6:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_7:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %513, [[V_LSHLREV_B64_pseudo_e64_3]].sub0, 0, implicit $exec + ; CHECK-NEXT: %490:vgpr_32, dead $sgpr_null = V_ADDC_U32_e64 %515, [[V_LSHLREV_B64_pseudo_e64_3]].sub1, [[V_ADD_CO_U32_e64_7]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE15:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_6]], %subreg.sub0, %490, %subreg.sub1 + ; CHECK-NEXT: GLOBAL_STORE_SHORT_D16_HI [[REG_SEQUENCE15]], [[V_ADD_U32_e64_11]], 2, 0, implicit $exec :: (store (s16) into %ir.arrayidx5 + 2, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE15]], [[V_ADD_U32_e64_11]], 0, 0, implicit $exec :: (store (s16) into %ir.arrayidx5, addrspace 1) + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; CHECK-NEXT: 
[[COPY37:%[0-9]+]]:sreg_32_xexec_hi = COPY $sgpr32 + ; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY37]], 1024, implicit-def dead $scc + ; CHECK-NEXT: $sgpr32 = COPY [[S_ADD_I32_1]] + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE2:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.20, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.20, align 4, addrspace 5) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[SI_SPILL_V64_RESTORE2]], 0, 0, implicit $exec :: (load (s32) from %ir.p6, addrspace 1) + ; CHECK-NEXT: [[V_LSHLREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 2, [[GLOBAL_LOAD_DWORD]], implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SVS [[V_ADD3_U32_e64_1]], [[V_LSHLREV_B32_e64_2]], [[COPY37]], 40, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.arrayidx1111, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE3:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.17, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.17, align 4, addrspace 5) + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[SI_SPILL_V64_RESTORE3]], [[SI_SPILL_V64_RESTORE1]].sub0, 4, 0, implicit $exec :: (store (s32) into %ir.arrayidx444, addrspace 1) + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET9:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array4, target-flags(amdgpu-gotprel32-hi) @array4, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM9:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET9]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY38:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM9]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD6:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY38]], 0, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s32) from @array4) + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET10:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array2, target-flags(amdgpu-gotprel32-hi) @array2, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM10:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET10]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY39:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM10]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD7:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY39]], 0, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s32) from @array2) + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET11:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array3, target-flags(amdgpu-gotprel32-hi) @array3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM11:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET11]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY40:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM11]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD8:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY40]], 0, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s32) from @array3) + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET12:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array5, target-flags(amdgpu-gotprel32-hi) @array5, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM12:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET12]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY41:%[0-9]+]]:vreg_64 = COPY 
[[S_LOAD_DWORDX2_IMM12]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD9:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY41]], 0, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s32) from @array5) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_4:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[FLAT_LOAD_DWORD8]], [[FLAT_LOAD_DWORD9]], implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE16:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MUL_LO_U32_e64_4]], %subreg.sub0, undef %455:vgpr_32, %subreg.sub1 + ; CHECK-NEXT: %406:vreg_64, $sgpr_null = V_MAD_U64_U32_e64 [[FLAT_LOAD_DWORD6]], [[FLAT_LOAD_DWORD7]], [[REG_SEQUENCE16]], 0, implicit $exec + ; CHECK-NEXT: [[COPY42:%[0-9]+]]:vgpr_32 = COPY %406.sub0 + ; CHECK-NEXT: S_BRANCH %bb.14 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.17.bb13: + ; CHECK-NEXT: successors: %bb.12(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI24:%[0-9]+]]:vgpr_32 = PHI [[PHI19]], %bb.14, [[V_ADD_U32_e64_10]], %bb.15 + ; CHECK-NEXT: [[PHI25:%[0-9]+]]:vgpr_32 = PHI [[PHI18]], %bb.14, [[V_ADD_U32_e64_10]], %bb.15 + ; CHECK-NEXT: [[PHI26:%[0-9]+]]:vgpr_32 = PHI [[SI_SPILL_V32_RESTORE13]], %bb.14, [[V_ADD_U32_e64_10]], %bb.15 + ; CHECK-NEXT: [[PHI27:%[0-9]+]]:vgpr_32 = PHI [[SI_SPILL_V32_RESTORE12]], %bb.14, [[V_ADD_U32_e64_10]], %bb.15 + ; CHECK-NEXT: [[PHI28:%[0-9]+]]:vgpr_32 = PHI [[PHI17]], %bb.14, [[V_ADD_U32_e64_10]], %bb.15 + ; CHECK-NEXT: [[PHI29:%[0-9]+]]:vgpr_32 = PHI [[COPY33]], %bb.14, [[V_ADD_U32_e64_10]], %bb.15 + ; CHECK-NEXT: [[PHI30:%[0-9]+]]:vgpr_32 = PHI [[COPY34]], %bb.14, [[V_ADD_U32_e64_10]], %bb.15 + ; CHECK-NEXT: [[PHI31:%[0-9]+]]:vgpr_32 = PHI [[COPY35]], %bb.14, [[V_ADD_U32_e64_10]], %bb.15 + ; CHECK-NEXT: [[PHI32:%[0-9]+]]:vgpr_32 = PHI [[COPY36]], %bb.14, [[V_ADD_U32_e64_10]], %bb.15 + ; CHECK-NEXT: [[PHI33:%[0-9]+]]:vgpr_32 = PHI [[SI_SPILL_V32_RESTORE11]], %bb.14, [[V_ADD_U32_e64_10]], %bb.15 + ; CHECK-NEXT: SI_END_CF [[SI_ELSE2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI24]], [[PHI16]], 0, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE14:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE15:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.19, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.19, addrspace 5) + ; CHECK-NEXT: [[V_SUB_U32_e64_4:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[SI_SPILL_V32_RESTORE15]], [[SI_SPILL_V32_RESTORE14]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_12]], [[V_SUB_U32_e64_4]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_5:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD_U32_e64_13]], [[V_ADD_U32_e64_9]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MUL_LO_U32_e64_5]], [[SI_SPILL_V64_RESTORE1]].sub0, 0, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE16:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.16, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5) + ; CHECK-NEXT: [[V_SUB_U32_e64_5:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_ADD_U32_e64_14]], [[SI_SPILL_V32_RESTORE16]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_SUB_U32_e64_5]], [[PHI25]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_6:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD_U32_e64_15]], [[PHI26]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_6:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_MUL_LO_U32_e64_6]], 
[[PHI27]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE17:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[PHI29]], %subreg.sub0, undef %453:vgpr_32, %subreg.sub1 + ; CHECK-NEXT: %422:vreg_64, $sgpr_null = V_MAD_U64_U32_e64 [[V_SUB_U32_e64_6]], [[PHI28]], [[REG_SEQUENCE17]], 0, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_7:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 %422.sub0, [[PHI30]], 0, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_8:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_SUB_U32_e64_7]], [[PHI31]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_SUB_U32_e64_8]], [[PHI32]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_7:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD_U32_e64_16]], [[PHI33]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_SHORT_D16_HI [[REG_SEQUENCE9]], [[V_MUL_LO_U32_e64_7]], 2, 0, implicit $exec :: (store (s16) into %ir.p7 + 2, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE9]], [[V_MUL_LO_U32_e64_7]], 0, 0, implicit $exec :: (store (s16) into %ir.p7, addrspace 1) + ; CHECK-NEXT: S_BRANCH %bb.12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.18.Flow19: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.12(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI34:%[0-9]+]]:vgpr_32 = PHI undef %172:vgpr_32, %bb.2, %91, %bb.20 + ; CHECK-NEXT: [[PHI35:%[0-9]+]]:vgpr_32 = PHI undef %172:vgpr_32, %bb.2, %90, %bb.20 + ; CHECK-NEXT: [[PHI36:%[0-9]+]]:vgpr_32 = PHI undef %172:vgpr_32, %bb.2, %89, %bb.20 + ; CHECK-NEXT: [[PHI37:%[0-9]+]]:vgpr_32 = PHI undef %172:vgpr_32, %bb.2, %88, %bb.20 + ; CHECK-NEXT: [[PHI38:%[0-9]+]]:vgpr_32 = PHI undef %172:vgpr_32, %bb.2, %87, %bb.20 + ; CHECK-NEXT: [[PHI39:%[0-9]+]]:vgpr_32 = PHI undef %172:vgpr_32, %bb.2, %86, %bb.20 + ; CHECK-NEXT: [[PHI40:%[0-9]+]]:vgpr_32 = PHI undef %172:vgpr_32, %bb.2, %85, %bb.20 + ; CHECK-NEXT: [[PHI41:%[0-9]+]]:vgpr_32 = PHI undef %172:vgpr_32, %bb.2, %84, %bb.20 + ; CHECK-NEXT: [[PHI42:%[0-9]+]]:vgpr_32 = PHI [[V_LSHL_OR_B32_e64_2]], %bb.2, undef %498:vgpr_32, %bb.20 + ; CHECK-NEXT: [[PHI43:%[0-9]+]]:vgpr_32 = PHI [[V_ADD_U32_e64_]], %bb.2, undef %500:vgpr_32, %bb.20 + ; CHECK-NEXT: [[PHI44:%[0-9]+]]:vgpr_32 = PHI [[V_ADD_U32_e64_1]], %bb.2, undef %502:vgpr_32, %bb.20 + ; CHECK-NEXT: [[PHI45:%[0-9]+]]:vgpr_32 = PHI [[COPY17]], %bb.2, undef %504:vgpr_32, %bb.20 + ; CHECK-NEXT: [[PHI46:%[0-9]+]]:vgpr_32 = PHI [[COPY12]], %bb.2, undef %506:vgpr_32, %bb.20 + ; CHECK-NEXT: [[PHI47:%[0-9]+]]:vgpr_32 = PHI [[COPY11]], %bb.2, undef %508:vgpr_32, %bb.20 + ; CHECK-NEXT: [[PHI48:%[0-9]+]]:vgpr_32 = PHI [[COPY10]], %bb.2, undef %510:vgpr_32, %bb.20 + ; CHECK-NEXT: [[PHI49:%[0-9]+]]:vgpr_32 = PHI [[COPY9]], %bb.2, undef %512:vgpr_32, %bb.20 + ; CHECK-NEXT: [[PHI50:%[0-9]+]]:vgpr_32 = PHI [[COPY8]], %bb.2, undef %514:vgpr_32, %bb.20 + ; CHECK-NEXT: [[PHI51:%[0-9]+]]:vgpr_32 = PHI [[COPY7]], %bb.2, undef %516:vgpr_32, %bb.20 + ; CHECK-NEXT: [[PHI52:%[0-9]+]]:vgpr_32 = PHI [[COPY6]], %bb.2, undef %518:vgpr_32, %bb.20 + ; CHECK-NEXT: [[PHI53:%[0-9]+]]:vgpr_32 = PHI [[COPY5]], %bb.2, undef %520:vgpr_32, %bb.20 + ; CHECK-NEXT: [[PHI54:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_LOAD_UBYTE4]], %bb.2, undef %522:vgpr_32, %bb.20 + ; CHECK-NEXT: [[PHI55:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_LOAD_UBYTE5]], %bb.2, undef %524:vgpr_32, %bb.20 + ; CHECK-NEXT: [[PHI56:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_LOAD_UBYTE6]], %bb.2, undef %526:vgpr_32, %bb.20 + ; CHECK-NEXT: [[PHI57:%[0-9]+]]:vgpr_32 = PHI [[GLOBAL_LOAD_UBYTE7]], %bb.2, undef %528:vgpr_32, %bb.20 + ; CHECK-NEXT: [[PHI58:%[0-9]+]]:vreg_128 = PHI 
[[GLOBAL_LOAD_DWORDX4_]], %bb.2, undef %530:vreg_128, %bb.20 + ; CHECK-NEXT: [[PHI59:%[0-9]+]]:vreg_128 = PHI [[GLOBAL_LOAD_DWORDX4_1]], %bb.2, undef %532:vreg_128, %bb.20 + ; CHECK-NEXT: [[PHI60:%[0-9]+]]:vreg_64 = PHI [[REG_SEQUENCE1]], %bb.2, undef %534:vreg_64, %bb.20 + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[PHI42]], %stack.13, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[PHI43]], %stack.12, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V128_SAVE [[PHI58]], %stack.11, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.11, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V128_SAVE [[PHI59]], %stack.10, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.10, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[PHI60]], %stack.9, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.9, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[PHI48]], %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[PHI49]], %stack.7, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[PHI53]], %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[PHI52]], %stack.5, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[PHI44]], %stack.4, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY1]], %stack.14, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY2]], %stack.15, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[PHI]], %stack.19, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.19, addrspace 5) + ; CHECK-NEXT: [[SI_ELSE3:%[0-9]+]]:sreg_32 = SI_ELSE [[SI_IF1]], %bb.12, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.19.bb4: + ; CHECK-NEXT: successors: %bb.20(0x04000000), %bb.19(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI61:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.3, %81, %bb.19 + ; CHECK-NEXT: [[PHI62:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.3, %80, %bb.19 + ; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI62]], 1, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI62]], 2, implicit-def dead $scc + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_2:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[S_ADD_I32_3]], [[COPY]], implicit $exec + ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[V_CMP_GE_U32_e64_2]], [[PHI61]], implicit-def dead $scc + ; CHECK-NEXT: [[COPY43:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_2]], implicit $exec + ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.20 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.20.bb14.loopexit: + ; CHECK-NEXT: successors: %bb.18(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[FLAT_LOAD_DWORD]], [[COPY43]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[FLAT_LOAD_DWORD1]], [[COPY43]], 0, 
implicit $exec + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET13:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array7, target-flags(amdgpu-gotprel32-hi) @array7, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM13:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET13]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY44:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM13]] + ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY44]], [[V_ADD_U32_e64_17]], 68, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr getelementptr inbounds nuw (i8, ptr @array7, i64 68)`) + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET14:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array9, target-flags(amdgpu-gotprel32-hi) @array9, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM14:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET14]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY45:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM14]] + ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY45]], [[V_ADD_U32_e64_18]], 60, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr getelementptr inbounds nuw (i8, ptr @array9, i64 60)`) + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET15:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array6, target-flags(amdgpu-gotprel32-hi) @array6, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM15:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET15]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY46:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM15]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD10:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY46]], 44, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array6, i64 44)`) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_8:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[FLAT_LOAD_DWORD10]], [[COPY43]], implicit $exec + ; CHECK-NEXT: [[FLAT_LOAD_DWORD11:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY44]], 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array7, i64 20)`) + ; CHECK-NEXT: [[V_SUB_U32_e64_9:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[FLAT_LOAD_DWORD11]], [[COPY43]], 0, implicit $exec + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET16:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array8, target-flags(amdgpu-gotprel32-hi) @array8, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM16:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET16]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY47:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM16]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD12:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY47]], 44, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array8, i64 44)`, align 8) + ; CHECK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[FLAT_LOAD_DWORD12]], [[COPY43]], 0, implicit $exec + ; CHECK-NEXT: [[FLAT_LOAD_DWORD13:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY45]], 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array9, i64 24)`) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_9:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[FLAT_LOAD_DWORD13]], [[COPY43]], implicit $exec + ; CHECK-NEXT: 
[[SI_PC_ADD_REL_OFFSET17:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array2, target-flags(amdgpu-gotprel32-hi) @array2, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM17:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET17]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY48:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM17]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD14:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY48]], 80, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array2, i64 80)`) + ; CHECK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[FLAT_LOAD_DWORD14]], [[COPY43]], 0, implicit $exec + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET18:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array3, target-flags(amdgpu-gotprel32-hi) @array3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM18:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET18]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY49:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM18]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD15:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY49]], 80, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array3, i64 80)`) + ; CHECK-NEXT: [[V_SUB_U32_e64_10:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[FLAT_LOAD_DWORD15]], [[COPY43]], 0, implicit $exec + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET19:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array4, target-flags(amdgpu-gotprel32-hi) @array4, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM19:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET19]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY50:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM19]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD16:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY50]], 80, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array4, i64 80)`, align 8) + ; CHECK-NEXT: [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[FLAT_LOAD_DWORD16]], [[COPY43]], 0, implicit $exec + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET20:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array5, target-flags(amdgpu-gotprel32-hi) @array5, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM20:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET20]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY51:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM20]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD17:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY51]], 80, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array5, i64 80)`) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_10:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[FLAT_LOAD_DWORD17]], [[COPY43]], implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.18 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.21.bb14: + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE17:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.19, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.19, addrspace 5) + ; CHECK-NEXT: [[V_ADD3_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[PHI15]], [[SI_SPILL_V32_RESTORE17]], [[PHI14]], implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 
[[V_ADD3_U32_e64_2]], [[PHI13]], [[PHI12]], implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[V_ADD3_U32_e64_3]], [[PHI11]], 100, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD3_U32_e64_4]], [[PHI10]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[V_ADD_U32_e64_22]], [[PHI9]], [[PHI8]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE6]], [[V_ADD3_U32_e64_5]], 4, 0, implicit $exec :: (store (s32) into %ir.gep3, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE5]], [[V_ADD3_U32_e64_4]], 4, 0, implicit $exec :: (store (s32) into %ir.gep4, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 +entry: + %ld1 = load i32, ptr addrspace(1) %p1, align 1 + %gep1 = getelementptr inbounds i32, ptr addrspace(1) %p1, i64 3 + %ld2 = load i32, ptr addrspace(1) %gep1, align 1 + %load1 = load i32, ptr addrspace(1) %p3, align 4 + %tmp1 = add i32 %load1, %ld1 + %load2 = load <8 x i32>, ptr addrspace(1) %p3, align 1 + store i32 %tmp1, ptr addrspace(1) %p3 + %add1 = add i32 %ld1, %tmp1 + br i1 %cond1, label %bb1, label %bb2 + +bb1: + br label %bb2 + +bb2: + %phi0 = phi i32 [ 100, %bb1 ], [ %add1, %entry ] + %ld3 = load i32, ptr addrspace(3) %p2, align 1 + %add2 = add i32 %ld3, 100 + br i1 %cond2, label %bb3, label %bb4 + +bb3: + %mul1 = mul i32 %ld1, %phi0 + %add3 = add i32 %mul1, 1000 + br label %bb5 + +bb5: + %add30 = add i32 %add3, 900 + %gep2 = getelementptr inbounds i32, ptr addrspace(3) %p2, i64 3 + %ld4 = load i32, ptr addrspace(3) %gep2, align 8 + %add5 = add i32 %ld4, %add30 + %load3 = load <8 x i32>, ptr addrspace(1) %p4, align 1 + %load4 = load i32, ptr addrspace(1) %p4, align 2 + %tmp2 = add i32 %load4, %tmp1 + store i32 %tmp2, ptr addrspace(1) %p4 + %stack = alloca [5 x i32], align 4, addrspace(5) + %load6 = load i32, ptr addrspace(1) %p6, align 4 + %arrayidx11 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %load6 + store i32 %phi0, ptr addrspace(5) %arrayidx11, align 4 + %arrayidx22 = getelementptr inbounds i32, ptr addrspace(1) %p6, i32 1 + %load7 = load i32, ptr addrspace(1) %arrayidx22, align 4 + %arrayidx33 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %load7 + store i32 %ld3, ptr addrspace(5) %arrayidx33, align 2 + %xor = xor i1 %cond1, %cond2 + br i1 %xor, label %bb6, label %bb7 + +bb6: + %and = and i1 %cond1, %cond2 + %idx10 = getelementptr inbounds [5 x i32], [5 x i32]* @array2, i64 1, i64 0 + %val0 = load i32, i32* %idx10, align 4 + %idx20 = getelementptr inbounds [5 x i32], [5 x i32]* @array4, i64 0, i64 1 + store i32 %val0, i32 *%idx20 + br i1 %and, label %bb8, label %bb9 + +bb8: + %add6 = add i32 %ld2, %add5 + %idx12 = getelementptr inbounds [5 x i32], [5 x i32]* @array2, i64 1, i64 2 + %val2 = load i32, i32* %idx12, align 4 + %idx22 = getelementptr inbounds [5 x i32], [5 x i32]* @array3, i64 3, i64 2 + store i32 %val2, i32 *%idx22 + br label %bb10 + +bb9: + %mul2 = mul i32 %ld2, %add5 + %idx13 = getelementptr inbounds [5 x i32], [5 x i32]* @array5, i64 1, i64 0 + %val3 = load i32, i32* %idx13, align 4 + %idx23 = getelementptr inbounds [5 x i32], [5 x i32]* @array4, i64 3, i64 0 + store i32 %val3, i32 *%idx23 + br label %bb10 + +bb7: + %sub1 = sub i32 %ld4, %add5 + %mul3 = mul i32 %sub1, %ld3 + %div = udiv i32 %mul3, %ld1 + %add7 = add i32 %div, %ld2 + %idx14 = getelementptr inbounds [5 x i32], [5 x i32]* @array3, i64 4, i64 1 + %val4 = load i32, i32* %idx14, align 4 + %idx24 = getelementptr 
inbounds [5 x i32], [5 x i32]* @array4, i64 3, i64 0 + store i32 %val4, i32 *%idx24 + br label %bb10 + +bb10: + %phi2 = phi i32 [ %add6, %bb8 ], [ %mul2, %bb9], [ %add7, %bb7 ] + %add8 = add i32 %add2, %phi2 + %extract1 = extractelement < 8 x i32> %load2, i32 1 + %extract2 = extractelement < 8 x i32> %load2, i32 2 + %extract3 = extractelement < 8 x i32> %load2, i32 6 + %extract4 = extractelement < 8 x i32> %load2, i32 7 + %add101 = add i32 %extract1, %extract4 + %add102 = add i32 %add101, %extract3 + %idx1 = zext i32 %extract2 to i64 + %arrayidx1 = getelementptr inbounds i32, ptr addrspace(1) %p1, i64 %idx1 + store i32 %add102, ptr addrspace(1) %arrayidx1, align 4 + %cond3 = icmp ne i1 %cond1, %cond2 + br i1 %cond3, label %bb11, label %bb12 + +bb11: + %extract5 = extractelement < 8 x i32> %load3, i32 3 + %extract6 = extractelement < 8 x i32> %load3, i32 5 + %tmp3 = add i32 %extract5, %phi2 + %idx2 = zext i32 %extract6 to i64 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %p5, i64 %idx2 + store i32 %tmp3, ptr addrspace(1) %arrayidx2, align 8 + br label %bb13 + +bb12: + %extract7 = extractelement < 8 x i32> %load3, i32 1 + %extract8 = extractelement < 8 x i32> %load3, i32 2 + %extract9 = extractelement < 8 x i32> %load2, i32 3 + %extract10 = extractelement < 8 x i32> %load2, i32 5 + %tmp4 = add i32 %extract9, %add8 + %idx3 = zext i32 %extract10 to i64 + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %p5, i64 %idx3 + store i32 %tmp4, ptr addrspace(1) %arrayidx3, align 2 + %tmp5 = add i32 %extract7, %tmp4 + %idx4 = zext i32 %extract8 to i64 + %arrayidx5 = getelementptr inbounds i32, ptr addrspace(1) %p6, i64 %idx4 + store i32 %tmp5, ptr addrspace(1) %arrayidx5, align 2 + %array1 = alloca [5 x i32], align 4, addrspace(5) + %load8 = load i32, ptr addrspace(1) %p6, align 4 + %arrayidx111 = getelementptr inbounds [5 x i32], ptr addrspace(5) %array1, i32 2, i32 %load8 + store i32 %tmp4, ptr addrspace(5) %arrayidx111, align 4 + %arrayidx222 = getelementptr inbounds i32, ptr addrspace(1) %p6, i32 1 + %load9 = load i32, ptr addrspace(1) %arrayidx222, align 4 + %arrayidx333 = getelementptr inbounds [5 x i32], ptr addrspace(5) %array1, i32 1, i32 %load9 + %load10 = load i32, ptr addrspace(5) %arrayidx333 + %arrayidx444 = getelementptr inbounds i32, ptr addrspace(1) %p4, i32 1 + store i32 %add30, ptr addrspace(1) %arrayidx444 + %idx15 = getelementptr inbounds [5 x i32], [5 x i32]* @array4, i64 0, i64 0 + %val5 = load i32, i32* %idx15, align 4 + %idx16 = getelementptr inbounds [5 x i32], [5 x i32]* @array2, i64 0, i64 0 + %val6 = load i32, i32* %idx16, align 4 + %idx17 = getelementptr inbounds [5 x i32], [5 x i32]* @array3, i64 0, i64 0 + %val7 = load i32, i32* %idx17, align 4 + %idx18 = getelementptr inbounds [5 x i32], [5 x i32]* @array5, i64 0, i64 0 + %val8 = load i32, i32* %idx18, align 4 + %mul10 = mul i32 %val5, %val6 + %mul11 = mul i32 %val7, %val8 + %add100 = add i32 %mul10, %mul11 + br label %bb13 + +bb13: + %phi3 = phi i32 [ %tmp3, %bb11 ], [ %add100, %bb12] + %phi4 = phi i32 [ %tmp3, %bb11 ], [ %val5, %bb12] + %phi5 = phi i32 [ %tmp3, %bb11 ], [ %load7, %bb12] + %phi6 = phi i32 [ %tmp3, %bb11 ], [ %load6, %bb12] + %phi7 = phi i32 [ %tmp3, %bb11 ], [ %val8, %bb12] + %phi8 = phi i32 [ %tmp3, %bb11 ], [ %extract1, %bb12] + %phi9 = phi i32 [ %tmp3, %bb11 ], [ %extract2, %bb12] + %phi10 = phi i32 [ %tmp3, %bb11 ], [ %extract3, %bb12] + %phi11 = phi i32 [ %tmp3, %bb11 ], [ %extract4, %bb12] + %phi12 = phi i32 [ %tmp3, %bb11 ], [ %load4, %bb12] + %add200 = add i32 %phi3, %phi2 
+ %add300 = sub i32 %phi0, %add1 + %add400 = add i32 %add200, %add300 + %add500 = mul i32 %add400, %add2 + %add600 = add i32 %add500, %add30 + %add700 = sub i32 %add600, %ld4 + %add800 = add i32 %add700, %phi4 + %add900 = mul i32 %add800, %phi5 + %add1000 = sub i32 %add900, %phi6 + %add1100 = mul i32 %add1000, %phi7 + %add1200 = add i32 %add1100, %phi8 + %add1300 = sub i32 %add1200, %phi9 + %add1400 = sub i32 %add1300, %phi10 + %add1500 = add i32 %add1400, %phi11 + %add1600 = mul i32 %add1500, %phi12 + store i32 %add1600, ptr addrspace(1) %p7, align 2 + br label %bb14 + +bb4: + %phi13 = phi i32 [ 0, %bb2 ], [ %ind, %bb4 ] + %idx600 = getelementptr inbounds [5 x i32], [5 x i32]* @array6, i64 1, i64 2 + %val600 = load i32, i32* %idx600, align 4 + %idx700 = getelementptr inbounds [5 x i32], [5 x i32]* @array7, i64 3, i64 2 + %addval600 = add i32 %val600, %phi13 + store i32 %addval600, i32 *%idx700 + %idx800 = getelementptr inbounds [5 x i32], [5 x i32]* @array8, i64 1, i64 0 + %val800 = load i32, i32* %idx800, align 4 + %idx900 = getelementptr inbounds [5 x i32], [5 x i32]* @array9, i64 3, i64 0 + %addval800 = add i32 %val800, %phi13 + store i32 %addval800, i32 *%idx900 + %idx601 = getelementptr inbounds [5 x i32], [5 x i32]* @array6, i64 2, i64 1 + %val601 = load i32, i32* %idx601, align 1 + %val611 = mul i32 %val601, %phi13 + %idx701 = getelementptr inbounds [5 x i32], [5 x i32]* @array7, i64 1, i64 0 + %val701 = load i32, i32* %idx701, align 2 + %val711 = sub i32 %val701, %phi13 + %idx801 = getelementptr inbounds [5 x i32], [5 x i32]* @array8, i64 2, i64 1 + %val801 = load i32, i32* %idx801, align 8 + %val811 = add i32 %val801, %phi13 + %idx901 = getelementptr inbounds [5 x i32], [5 x i32]* @array9, i64 1, i64 1 + %val901 = load i32, i32* %idx901, align 1 + %val911 = mul i32 %val901, %phi13 + %idx602 = getelementptr inbounds [5 x i32], [5 x i32]* @array2, i64 4, i64 0 + %val602 = load i32, i32* %idx602, align 1 + %val612 = add i32 %val602, %phi13 + %idx702 = getelementptr inbounds [5 x i32], [5 x i32]* @array3, i64 4, i64 0 + %val702 = load i32, i32* %idx702, align 2 + %val712 = sub i32 %val702, %phi13 + %idx802 = getelementptr inbounds [5 x i32], [5 x i32]* @array4, i64 4, i64 0 + %val802 = load i32, i32* %idx802, align 8 + %val812 = add i32 %val802, %phi13 + %idx902 = getelementptr inbounds [5 x i32], [5 x i32]* @array5, i64 4, i64 0 + %val902 = load i32, i32* %idx902, align 1 + %val912 = mul i32 %val902, %phi13 + %ind = add i32 %phi13, 1 + %loop.cond = icmp ult i32 %ind, %TC1 + br i1 %loop.cond, label %bb4, label %bb14 + +bb14: + %phi14 = phi i32 [ %add200, %bb13 ], [ %val611, %bb4 ] + %phi15 = phi i32 [ %add500, %bb13 ], [ %val711, %bb4 ] + %phi16 = phi i32 [ %add600, %bb13 ], [ %val811, %bb4 ] + %phi17 = phi i32 [ %add300, %bb13 ], [ %val911, %bb4 ] + %phi18 = phi i32 [ %add1000, %bb13 ], [ %val612, %bb4 ] + %phi19 = phi i32 [ %add1600, %bb13 ], [ %val712, %bb4 ] + %phi20 = phi i32 [ %add500, %bb13 ], [ %val812, %bb4 ] + %phi21 = phi i32 [ %add500, %bb13 ], [ %val912, %bb4 ] + %addall1 = add i32 %phi14, %phi0 + %addall2 = add i32 %addall1, %phi15 + %addall3 = add i32 %addall2, 100 + %addall4 = add i32 %addall3, %phi16 + %addall5 = add i32 %addall4, %phi17 + %addall6 = add i32 %addall5, %phi18 + %addall7 = add i32 %addall6, %phi19 + %addall8 = add i32 %addall7, %phi20 + %addall9 = add i32 %addall8, %phi21 + %gep3 = getelementptr inbounds i32, ptr addrspace(1) %p8, i64 1 + store i32 %addall9, ptr addrspace(1) %gep3 + %gep4 = getelementptr inbounds i32, ptr addrspace(1) %p9, i64 1 + 
store i32 %addall6, ptr addrspace(1) %gep4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/test_ers_nested_loops.ll b/llvm/test/CodeGen/AMDGPU/test_ers_nested_loops.ll new file mode 100644 index 0000000000000..55ae31277c6cd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/test_ers_nested_loops.ll @@ -0,0 +1,208 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=amdgpu-early-register-spilling -verify-machineinstrs -print-after=amdgpu-early-register-spilling -max-vgprs=13 < %s 2>&1 | FileCheck %s + +; +; bb.0.entry +; | +; bb.1.loop1.header<-------+ +; | | +; bb.2.loop2.header<---+ | +; | | | +; bb.3.loop3<--+ | | +; | | | | +; +--------+ | | +; | | | +; bb.4.loop2.latch-----+ | +; | | +; bb.5.loop1.latch---------+ +; | +; bb.6.exit +; +define amdgpu_ps i32 @test(ptr addrspace(1) %p1, ptr addrspace(1) %p2, ptr addrspace(1) %p3, ptr addrspace(1) %p4, ptr addrspace(1) %p5, i32 %TC1, i32 %TC2, i32 %TC3) { + ; CHECK-LABEL: name: test + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1 + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 0, 0, implicit $exec :: (load (s8) from %ir.p1, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 1, 0, implicit $exec :: (load (s8) from %ir.p1 + 1, addrspace 1) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY1]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE1]], 8, [[GLOBAL_LOAD_UBYTE]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE2:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 2, 0, implicit $exec :: (load (s8) from %ir.p1 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE3:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE4]], 3, 0, implicit $exec :: (load (s8) from %ir.p1 + 3, addrspace 1) + ; CHECK-NEXT: 
[[V_LSHL_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE3]], 8, [[GLOBAL_LOAD_UBYTE2]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_1]], 16, [[V_LSHL_OR_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.loop1.header: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %28, %bb.5 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %27, %bb.5 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[V_LSHL_OR_B32_e64_2]], %bb.0, %26, %bb.5 + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 13, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.loop2.header: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.1, %19, %bb.4 + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.1, %18, %bb.4 + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_]], %bb.1, %17, %bb.4 + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE4:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (load (s8) from %ir.p2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE5:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 1, 0, implicit $exec :: (load (s8) from %ir.p2 + 1, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_3:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE5]], 8, [[GLOBAL_LOAD_UBYTE4]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE6:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 2, 0, implicit $exec :: (load (s8) from %ir.p2 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE7:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 3, 0, implicit $exec :: (load (s8) from %ir.p2 + 3, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_4:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE7]], 8, [[GLOBAL_LOAD_UBYTE6]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_5:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_4]], 16, [[V_LSHL_OR_B32_e64_3]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI4]], [[V_LSHL_OR_B32_e64_5]], 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_BYTE_D16_HI [[REG_SEQUENCE1]], [[V_ADD_U32_e64_]], 2, 0, implicit $exec :: (store (s8) into %ir.p4 + 2, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_BYTE [[REG_SEQUENCE1]], [[V_ADD_U32_e64_]], 0, 0, implicit $exec :: (store (s8) into %ir.p4, addrspace 1) + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 24, [[V_ADD_U32_e64_]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_BYTE [[REG_SEQUENCE1]], [[V_LSHRREV_B32_e64_]], 3, 0, implicit $exec :: (store (s8) into %ir.p4 + 3, addrspace 1) + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 8, [[V_ADD_U32_e64_]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_BYTE [[REG_SEQUENCE1]], [[V_LSHRREV_B32_e64_1]], 1, 0, implicit $exec :: (store (s8) into %ir.p4 + 1, addrspace 1) + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: {{ $}} + 
; CHECK-NEXT: bb.3.loop3: + ; CHECK-NEXT: successors: %bb.4(0x04000000), %bb.3(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.2, %12, %bb.3 + ; CHECK-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.2, %10, %bb.3 + ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI7]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE8:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (load (s8) from %ir.p3, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE9:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 1, 0, implicit $exec :: (load (s8) from %ir.p3 + 1, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_6:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE9]], 8, [[GLOBAL_LOAD_UBYTE8]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE10:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 2, 0, implicit $exec :: (load (s8) from %ir.p3 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE11:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 3, 0, implicit $exec :: (load (s8) from %ir.p3 + 3, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_7:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE11]], 8, [[GLOBAL_LOAD_UBYTE10]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_8:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_7]], 16, [[V_LSHL_OR_B32_e64_6]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[S_ADD_I32_]], [[V_LSHL_OR_B32_e64_8]], 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_BYTE_D16_HI [[REG_SEQUENCE]], [[V_ADD_U32_e64_1]], 2, 0, implicit $exec :: (store (s8) into %ir.p5 + 2, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_BYTE [[REG_SEQUENCE]], [[V_ADD_U32_e64_1]], 0, 0, implicit $exec :: (store (s8) into %ir.p5, addrspace 1) + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 24, [[V_ADD_U32_e64_1]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_BYTE [[REG_SEQUENCE]], [[V_LSHRREV_B32_e64_2]], 3, 0, implicit $exec :: (store (s8) into %ir.p5 + 3, addrspace 1) + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_3:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 8, [[V_ADD_U32_e64_1]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_BYTE [[REG_SEQUENCE]], [[V_LSHRREV_B32_e64_3]], 1, 0, implicit $exec :: (store (s8) into %ir.p5 + 1, addrspace 1) + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[S_ADD_I32_]], [[SI_SPILL_V32_RESTORE1]], implicit $exec + ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[V_CMP_GE_U32_e64_]], [[PHI6]], implicit-def dead $scc + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]], implicit $exec + ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.loop2.latch: + ; CHECK-NEXT: successors: %bb.5(0x04000000), %bb.2(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHL_OR_B32_e64_8]], [[COPY12]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[PHI2]], [[V_ADD_U32_e64_2]], implicit $exec + ; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI4]], 2, implicit-def dead $scc + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[S_ADD_I32_1]], [[SI_SPILL_V32_RESTORE]], implicit 
$exec + ; CHECK-NEXT: [[SI_IF_BREAK1:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[V_CMP_GE_U32_e64_1]], [[PHI3]], implicit-def dead $scc + ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK1]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5.loop1.latch: + ; CHECK-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_LSHL_OR_B32_e64_5]], [[COPY12]], 0, implicit $exec + ; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI1]], 1, implicit-def dead $scc + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_2:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[S_ADD_I32_2]], [[SI_SPILL_V32_RESTORE1]], implicit $exec + ; CHECK-NEXT: [[SI_IF_BREAK2:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[V_CMP_GE_U32_e64_2]], [[PHI]], implicit-def dead $scc + ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK2]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6.exit: + ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[V_MAX_U32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U32_e64 1, [[SI_SPILL_V32_RESTORE1]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MAX_U32_e64_]], [[V_LSHL_OR_B32_e64_2]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[PHI5]], [[PHI2]], [[V_ADD_U32_e64_2]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_2]], [[V_MUL_LO_U32_e64_]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[V_ADD_U32_e64_4]], [[V_ADD3_U32_e64_]], [[PHI5]], implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[V_ADD_U32_e64_3]], [[V_ADD3_U32_e64_1]], -1, implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_ADD3_U32_e64_2]], implicit $exec + ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG killed $sgpr0 +entry: +; entry +; | +; loop1.header<-------+ +; | | +; loop2.header<-----+ | +; | | | +; loop3<-------+ | | +; +----------+ | | +; | | | +; loop2.latch-------+ | +; | | +; loop1.latch---------+ +; | +; exit + %ld1 = load i32, ptr addrspace(1) %p1, align 1 + br label %loop1.header + +loop1.header: + %phi.inc1 = phi i32 [ 0, %entry ], [ %inc1, %loop1.latch ] + %phi1 = phi i32 [ %ld1, %entry ], [ %sub, %loop1.latch ] + %add1 = add i32 %ld1, %phi.inc1 + br label %loop2.header + +loop2.header: + %phi.inc2 = phi i32 [ 0, %loop1.header ], [ %inc2, %loop2.latch ] + %phi2 = phi i32 [ 13, %loop1.header ], [ %mul, %loop2.latch ] + %ld2 = load i32, ptr addrspace(1) %p2, align 1 + %add2 = add i32 %ld2, %phi.inc2 + store i32 %add2, ptr addrspace(1) %p4, align 1 + br label %loop3 + +loop3: + %phi.inc3 = phi i32 [ 0, %loop2.header ], [ %inc3, %loop3 ] + %inc3 = add i32 %phi.inc3, 3 + %sub = sub i32 %ld2, %inc3 + %ld3 = load i32, ptr addrspace(1) %p3, align 1 + %add3 = add i32 %ld3, %inc3 + store i32 %add3, ptr addrspace(1) %p5, align 1 + %cond3 = icmp ult i32 %inc3, %TC1 + br i1 %cond3, label %loop3, label %loop2.latch + +loop2.latch: + %mul = mul i32 %phi1, %add3 + %inc2 = add i32 %phi.inc2, 2 + %cond2 = icmp ult i32 %inc2, %TC2 + br i1 %cond2, label 
%loop2.header, label %loop1.latch + +loop1.latch: + %add4 = add i32 %phi2, %phi1 + %add5 = add i32 %add3, %add4 + %inc1 = add i32 %phi.inc1, 1 + %cond1 = icmp ult i32 %inc1, %TC1 + br i1 %cond1, label %loop1.header, label %exit + +exit: + %add6 = add i32 %add3, %mul + %add7 = add i32 %add6, %add5 + %add8 = add i32 %add7, %phi2 + %add9 = add i32 %add8, %add1 + ret i32 %add9 +} diff --git a/llvm/test/CodeGen/AMDGPU/test_ers_spill_in_common_dominator_and_optimize_restores.ll b/llvm/test/CodeGen/AMDGPU/test_ers_spill_in_common_dominator_and_optimize_restores.ll new file mode 100644 index 0000000000000..8f2b9834ed908 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/test_ers_spill_in_common_dominator_and_optimize_restores.ll @@ -0,0 +1,260 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=amdgpu-early-register-spilling -verify-machineinstrs -print-after=amdgpu-early-register-spilling -max-vgprs=20 < %s 2>&1 | FileCheck %s + +@array1 = global [5 x i32] zeroinitializer, align 4 +@array2 = global [5 x i32] zeroinitializer, align 4 +@array3 = global [5 x i32] zeroinitializer, align 4 +@array4 = global [5 x i32] zeroinitializer, align 4 + +; bb.0.entry +; / | +; bb.1.bb1 | +; \ | +; bb.2.bb2 +; +define amdgpu_ps void @test(ptr addrspace(1) %p1, ptr addrspace(3) %p2, i1 %cond1, ptr addrspace(1) %p3, ptr addrspace(1) %p4, ptr addrspace(1) %p5, i32 %arg1, i32 %arg2) { + ; CHECK-LABEL: name: test + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1 + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY8]], implicit $exec + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 1, [[V_AND_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (load (s8) from %ir.p1, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 1, 0, implicit $exec :: (load (s8) from %ir.p1 + 1, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE1]], 8, [[GLOBAL_LOAD_UBYTE]], implicit $exec + ; CHECK-NEXT: 
[[GLOBAL_LOAD_UBYTE2:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 2, 0, implicit $exec :: (load (s8) from %ir.p1 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE3:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 3, 0, implicit $exec :: (load (s8) from %ir.p1 + 3, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE3]], 8, [[GLOBAL_LOAD_UBYTE2]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_1]], 16, [[V_LSHL_OR_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE4:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 12, 0, implicit $exec :: (load (s8) from %ir.gep1, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE5:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 13, 0, implicit $exec :: (load (s8) from %ir.gep1 + 1, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_3:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE5]], 8, [[GLOBAL_LOAD_UBYTE4]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE6:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 14, 0, implicit $exec :: (load (s8) from %ir.gep1 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE7:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE3]], 15, 0, implicit $exec :: (load (s8) from %ir.gep1 + 3, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_4:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE7]], 8, [[GLOBAL_LOAD_UBYTE6]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_5:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_4]], 16, [[V_LSHL_OR_B32_e64_3]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], 16, 0, implicit $exec :: (load (s128) from %ir.p3 + 16, align 4, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_1:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (load (s128) from %ir.p3, align 4, addrspace 1) + ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[GLOBAL_LOAD_DWORDX4_1]].sub0, [[V_LSHL_OR_B32_e64_2]], 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[V_ADD_U32_e64_]], 0, 0, implicit $exec :: (store (s32) into %ir.p3, addrspace 1) + ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[V_LSHL_OR_B32_e64_2]], 0, implicit $exec + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array1, target-flags(amdgpu-gotprel32-hi) @array1, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY12]], 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array1, i64 20)`) + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET1:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array3, target-flags(amdgpu-gotprel32-hi) @array3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET1]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM1]] + ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY13]], 
[[FLAT_LOAD_DWORD]], 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr getelementptr inbounds nuw (i8, ptr @array3, i64 4)`) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE8:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 20, 0, implicit $exec :: (load (s8) from %ir.p4 + 20, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE9:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 21, 0, implicit $exec :: (load (s8) from %ir.p4 + 21, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_6:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE9]], 8, [[GLOBAL_LOAD_UBYTE8]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE10:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 22, 0, implicit $exec :: (load (s8) from %ir.p4 + 22, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE11:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 23, 0, implicit $exec :: (load (s8) from %ir.p4 + 23, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_7:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE11]], 8, [[GLOBAL_LOAD_UBYTE10]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_8:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_7]], 16, [[V_LSHL_OR_B32_e64_6]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE12:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 12, 0, implicit $exec :: (load (s8) from %ir.p4 + 12, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE13:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 13, 0, implicit $exec :: (load (s8) from %ir.p4 + 13, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_9:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE13]], 8, [[GLOBAL_LOAD_UBYTE12]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE14:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 14, 0, implicit $exec :: (load (s8) from %ir.p4 + 14, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE15:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 15, 0, implicit $exec :: (load (s8) from %ir.p4 + 15, addrspace 1) + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[REG_SEQUENCE]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_10:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE15]], 8, [[GLOBAL_LOAD_UBYTE14]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_11:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_10]], 16, [[V_LSHL_OR_B32_e64_9]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s16) from %ir.p4, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[REG_SEQUENCE1]], 2, 0, implicit $exec :: (load (s16) from %ir.p4 + 2, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_12:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_USHORT1]], 16, [[GLOBAL_LOAD_USHORT]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHL_OR_B32_e64_12]], [[V_ADD_U32_e64_]], 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE1]], [[V_ADD_U32_e64_2]], 0, 0, implicit $exec :: (store (s32) into %ir.p4, addrspace 1) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_ADD_U32_e64_]], %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_LSHL_OR_B32_e64_8]], %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) 
into %stack.3, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_LSHL_OR_B32_e64_11]], %stack.4, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.bb1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY9]], 0, 0, implicit $exec :: (load (s8) from %ir.p2, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY9]], 1, 0, implicit $exec :: (load (s8) from %ir.p2 + 1, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY9]], 2, 0, implicit $exec :: (load (s8) from %ir.p2 + 2, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U8_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY9]], 3, 0, implicit $exec :: (load (s8) from %ir.p2 + 3, addrspace 3) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_13:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U8_gfx9_1]], 8, [[DS_READ_U8_gfx9_]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_14:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[DS_READ_U8_gfx9_3]], 8, [[DS_READ_U8_gfx9_2]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_15:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_14]], 16, [[V_LSHL_OR_B32_e64_13]], implicit $exec + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY14]], 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array1, i64 28)`) + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET2:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array2, target-flags(amdgpu-gotprel32-hi) @array2, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET2]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM2]] + ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY15]], [[FLAT_LOAD_DWORD1]], 68, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr getelementptr inbounds nuw (i8, ptr @array2, i64 68)`) + ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET3:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @array4, target-flags(amdgpu-gotprel32-hi) @array4, implicit-def dead $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM3:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET3]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM3]] + ; CHECK-NEXT: [[FLAT_LOAD_DWORD2:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY16]], 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array4, i64 20)`) + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM1]] + ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY17]], [[FLAT_LOAD_DWORD2]], 60, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr getelementptr inbounds nuw (i8, ptr @array3, i64 60)`) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD3:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY17]], 84, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr getelementptr inbounds nuw (i8, ptr @array3, i64 84)`) + ; CHECK-NEXT: 
[[FLAT_LOAD_DWORD4:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s32) from @array3) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD5:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY14]], 0, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s32) from @array1) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD6:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY15]], 0, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s32) from @array2) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD7:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY16]], 0, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s32) from @array4) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[FLAT_LOAD_DWORD6]], [[FLAT_LOAD_DWORD7]], implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MUL_LO_U32_e64_]], %subreg.sub0, undef %261:vgpr_32, %subreg.sub1 + ; CHECK-NEXT: %222:vreg_64, $sgpr_null = V_MAD_U64_U32_e64 [[FLAT_LOAD_DWORD4]], [[FLAT_LOAD_DWORD5]], [[REG_SEQUENCE4]], 0, implicit $exec + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY %222.sub0 + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_1]].sub1 + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_1]].sub2 + ; CHECK-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub2 + ; CHECK-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.bb2: + ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_LSHL_OR_B32_e64_2]], %bb.0, [[FLAT_LOAD_DWORD1]], %bb.1 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, [[FLAT_LOAD_DWORD2]], %bb.1 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, [[FLAT_LOAD_DWORD3]], %bb.1 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, [[FLAT_LOAD_DWORD4]], %bb.1 + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, [[FLAT_LOAD_DWORD5]], %bb.1 + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, [[FLAT_LOAD_DWORD6]], %bb.1 + ; CHECK-NEXT: [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, [[FLAT_LOAD_DWORD7]], %bb.1 + ; CHECK-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, [[COPY18]], %bb.1 + ; CHECK-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, [[COPY19]], %bb.1 + ; CHECK-NEXT: [[PHI9:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, [[COPY20]], %bb.1 + ; CHECK-NEXT: [[PHI10:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, [[COPY21]], %bb.1 + ; CHECK-NEXT: [[PHI11:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, [[COPY22]], %bb.1 + ; CHECK-NEXT: [[PHI12:%[0-9]+]]:vgpr_32 = PHI [[V_LSHL_OR_B32_e64_5]], %bb.0, [[V_LSHL_OR_B32_e64_2]], %bb.1 + ; CHECK-NEXT: [[PHI13:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, [[COPY1]], %bb.1 + ; CHECK-NEXT: [[PHI14:%[0-9]+]]:vgpr_32 = PHI [[V_ADD_U32_e64_1]], %bb.0, [[V_LSHL_OR_B32_e64_15]], %bb.1 + ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) + ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI]], [[SI_SPILL_V32_RESTORE]], 0, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; 
CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD_U32_e64_3]], [[SI_SPILL_V32_RESTORE1]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_MUL_LO_U32_e64_1]], [[FLAT_LOAD_DWORD]], 0, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_1:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_SUB_U32_e64_]], [[PHI1]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_SUB_U32_e64_1]], [[PHI2]], 0, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE2:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: [[V_SUB_U32_e64_2:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_ADD_U32_e64_4]], [[SI_SPILL_V32_RESTORE2]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_SUB_U32_e64_2]], [[PHI3]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_MUL_LO_U32_e64_2]], [[PHI4]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_3:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_MUL_LO_U32_e64_3]], [[PHI5]], 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_SUB_U32_e64_3]], [[PHI6]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_4:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD_U32_e64_5]], [[PHI7]], implicit $exec + ; CHECK-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[V_MUL_LO_U32_e64_4]], [[PHI8]], [[PHI9]], implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_4:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_ADD3_U32_e64_]], [[PHI10]], 0, implicit $exec + ; CHECK-NEXT: [[V_SUB_U32_e64_5:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_SUB_U32_e64_4]], [[PHI11]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_5:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_SUB_U32_e64_5]], [[PHI12]], implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[PHI14]], %subreg.sub0, undef %259:vgpr_32, %subreg.sub1 + ; CHECK-NEXT: %244:vreg_64, $sgpr_null = V_MAD_U64_U32_e64 [[V_MUL_LO_U32_e64_5]], [[PHI13]], [[REG_SEQUENCE5]], 0, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE3:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: [[V_SUB_U32_e64_6:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 %244.sub0, [[SI_SPILL_V32_RESTORE3]], 0, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: GLOBAL_STORE_SHORT_D16_HI [[SI_SPILL_V64_RESTORE]], [[V_SUB_U32_e64_6]], 2, 0, implicit $exec :: (store (s16) into %ir.p5 + 2, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_SHORT [[SI_SPILL_V64_RESTORE]], [[V_SUB_U32_e64_6]], 0, 0, implicit $exec :: (store (s16) into %ir.p5, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 +entry: + %ld1 = load i32, ptr addrspace(1) %p1, align 1 + %gep1 = getelementptr inbounds i32, ptr addrspace(1) %p1, i64 3 + %ld2 = load i32, ptr addrspace(1) %gep1, align 1 + %load1 = load i32, ptr addrspace(1) %p3, align 4 + %tmp1 = add i32 %load1, %ld1 + %load2 = load <8 x i32>, ptr addrspace(1) %p3, align 1 + store i32 %tmp1, ptr addrspace(1) %p3 + %add1 = add i32 %ld1, %tmp1 + %idx10 = getelementptr inbounds [5 x i32], [5 x i32]* @array1, i64 1, i64 0 + %val0 = load i32, i32* %idx10, align 4 + %idx20 = getelementptr inbounds [5 x i32], [5 x i32]* @array3, i64 0, i64 1 + store i32 %val0, i32 *%idx20 + %load3 = load <8 x i32>, 
ptr addrspace(1) %p4, align 1 + %load4 = load i32, ptr addrspace(1) %p4, align 2 + %tmp2 = add i32 %load4, %tmp1 + store i32 %tmp2, ptr addrspace(1) %p4 + br i1 %cond1, label %bb1, label %bb2 + +bb1: + %ld3 = load i32, ptr addrspace(3) %p2, align 1 + %idx12 = getelementptr inbounds [5 x i32], [5 x i32]* @array1, i64 1, i64 2 + %val2 = load i32, i32* %idx12, align 4 + %idx22 = getelementptr inbounds [5 x i32], [5 x i32]* @array2, i64 3, i64 2 + store i32 %val2, i32 *%idx22 + %idx13 = getelementptr inbounds [5 x i32], [5 x i32]* @array4, i64 1, i64 0 + %val3 = load i32, i32* %idx13, align 4 + %idx23 = getelementptr inbounds [5 x i32], [5 x i32]* @array3, i64 3, i64 0 + store i32 %val3, i32 *%idx23 + %idx14 = getelementptr inbounds [5 x i32], [5 x i32]* @array3, i64 4, i64 1 + %val4 = load i32, i32* %idx14, align 4 + %idx24 = getelementptr inbounds [5 x i32], [5 x i32]* @array4, i64 3, i64 0 + %idx15 = getelementptr inbounds [5 x i32], [5 x i32]* @array3, i64 0, i64 0 + %val5 = load i32, i32* %idx15, align 4 + %idx16 = getelementptr inbounds [5 x i32], [5 x i32]* @array1, i64 0, i64 0 + %val6 = load i32, i32* %idx16, align 4 + %idx17 = getelementptr inbounds [5 x i32], [5 x i32]* @array2, i64 0, i64 0 + %val7 = load i32, i32* %idx17, align 4 + %idx18 = getelementptr inbounds [5 x i32], [5 x i32]* @array4, i64 0, i64 0 + %val8 = load i32, i32* %idx18, align 4 + %mul10 = mul i32 %val5, %val6 + %mul11 = mul i32 %val7, %val8 + %add100 = add i32 %mul10, %mul11 + %extract1 = extractelement < 8 x i32> %load2, i32 1 + %extract2 = extractelement < 8 x i32> %load2, i32 2 + %extract3 = extractelement < 8 x i32> %load2, i32 6 + %extract4 = extractelement < 8 x i32> %load2, i32 7 + br label %bb2 + +bb2: + %phi1 = phi i32 [ %ld1, %entry ], [ %val2, %bb1 ] + %phi2 = phi i32 [ %val0, %entry ], [ %val3, %bb1 ] + %phi3 = phi i32 [ %val0, %entry ], [ %val4, %bb1 ] + %phi4 = phi i32 [ %val0, %entry ], [ %val5, %bb1 ] + %phi5 = phi i32 [ %val0, %entry ], [ %val6, %bb1 ] + %phi6 = phi i32 [ %val0, %entry ], [ %val7, %bb1 ] + %phi7 = phi i32 [ %val0, %entry ], [ %val8, %bb1 ] + %phi8 = phi i32 [ %val0, %entry ], [ %add100, %bb1 ] + %phi9 = phi i32 [ %val0, %entry ], [ %extract1, %bb1 ] + %phi10 = phi i32 [ %val0, %entry ], [ %extract2, %bb1 ] + %phi11 = phi i32 [ %val0, %entry ], [ %extract3, %bb1 ] + %phi12 = phi i32 [ %val0, %entry ], [ %extract4, %bb1 ] + %phi13 = phi i32 [ %ld2, %entry ], [ %ld1, %bb1 ] + %phi14 = phi i32 [ %val0, %entry ], [ %arg1, %bb1 ] + %phi15 = phi i32 [ %add1, %entry ], [ %ld3, %bb1 ] + %extract5 = extractelement < 8 x i32> %load3, i32 3 + %extract6 = extractelement < 8 x i32> %load3, i32 5 + %res1 = add i32 %phi1, %extract5 + %res2 = mul i32 %res1, %extract6 + %res3 = sub i32 %res2, %val0 + %res4 = sub i32 %res3, %phi2 + %res5 = add i32 %res4, %phi3 + %res6 = sub i32 %res5, %tmp1 + %res7 = mul i32 %res6, %phi4 + %res8 = mul i32 %res7, %phi5 + %res9 = sub i32 %res8, %phi6 + %res10 = add i32 %res9, %phi7 + %res11 = mul i32 %res10, %phi8 + %res12 = add i32 %res11, %phi9 + %res13 = add i32 %res12, %phi10 + %res14 = sub i32 %res13, %phi11 + %res15 = sub i32 %res14, %phi12 + %res16 = mul i32 %res15, %phi13 + %res17 = mul i32 %res16, %phi14 + %res18 = add i32 %res17, %phi15 + %res19 = sub i32 %res18, %arg2 + store i32 %res19, ptr addrspace(1) %p5, align 2 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/test_ers_spill_loop_livethrough_reg.ll b/llvm/test/CodeGen/AMDGPU/test_ers_spill_loop_livethrough_reg.ll new file mode 100644 index 0000000000000..3f734ae81d6e5 --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/test_ers_spill_loop_livethrough_reg.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=amdgpu-early-register-spilling -verify-machineinstrs -print-after=amdgpu-early-register-spilling -max-vgprs=10 < %s 2>&1 | FileCheck %s + +; +; bb.0.entry +; | +; bb.1.loop.header<--+ +; / | | +; bb.2.bb1 | | +; \ | | +; bb.5.Flow | +; / | | +; bb.6.bb3 | | +; \ | | +; bb.3.Flow1 | +; / | | +; bb.4.bb2 | | +; \ | | +; bb.7.loop.latch----+ +; | +; bb.8.exit +; +define amdgpu_ps void @test(ptr addrspace(1) %p1, ptr addrspace(1) %p2, ptr addrspace(1) %p3, i32 %TC) { + ; CHECK-LABEL: name: test + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (load (s8) from %ir.p1, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 1, 0, implicit $exec :: (load (s8) from %ir.p1 + 1, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE1]], 8, [[GLOBAL_LOAD_UBYTE]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE2:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 2, 0, implicit $exec :: (load (s8) from %ir.p1 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE3:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 3, 0, implicit $exec :: (load (s8) from %ir.p1 + 3, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE3]], 8, [[GLOBAL_LOAD_UBYTE2]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_1]], 16, [[V_LSHL_OR_B32_e64_]], implicit $exec + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[REG_SEQUENCE]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.loop.header: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %19, %bb.7 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:vreg_64 = PHI undef %32:vreg_64, %bb.0, %12, %bb.7 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %18, %bb.7 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[V_LSHL_OR_B32_e64_2]], %bb.0, %17, %bb.7 + ; CHECK-NEXT: [[V_CMP_GE_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GE_I32_e64 [[PHI2]], [[V_LSHL_OR_B32_e64_2]], implicit $exec + ; CHECK-NEXT: 
[[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[PHI2]], [[V_LSHL_OR_B32_e64_2]], implicit $exec + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.bb1: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[PHI2]], 31, implicit-def dead $scc + ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[PHI2]], %subreg.sub0, [[S_ASHR_I32_]], %subreg.sub1 + ; CHECK-NEXT: [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = nsw S_LSHL_B64 [[REG_SEQUENCE3]], 2, implicit-def dead $scc + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[REG_SEQUENCE1]].sub0, [[S_LSHL_B64_]].sub0, 0, implicit $exec + ; CHECK-NEXT: %92:vgpr_32, dead $sgpr_null = V_ADDC_U32_e64 [[S_LSHL_B64_]].sub1, [[REG_SEQUENCE1]].sub1, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %92, %subreg.sub1 + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE4]], 0, 0, implicit $exec :: (load (s32) from %ir.gep, addrspace 1) + ; CHECK-NEXT: [[V_CMP_LE_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LE_I32_e64 [[GLOBAL_LOAD_DWORD]], [[V_LSHL_OR_B32_e64_2]], implicit $exec + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $exec_lo + ; CHECK-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[V_CMP_GE_I32_e64_]], $exec_lo, implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_LE_I32_e64_]], $exec_lo, implicit-def dead $scc + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_]], [[S_AND_B32_]], implicit-def dead $scc + ; CHECK-NEXT: S_BRANCH %bb.5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.Flow1: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI %13, %bb.5, %82, %bb.6 + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %61:vgpr_32, %bb.5, %16, %bb.6 + ; CHECK-NEXT: SI_END_CF %15, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[PHI4]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.bb2: + ; CHECK-NEXT: successors: %bb.7(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: GLOBAL_STORE_DWORD %12, [[PHI3]], 0, 0, implicit $exec :: (store (s32) into %ir.9, addrspace 1) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5.Flow: + ; CHECK-NEXT: successors: %bb.6(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[V_CMP_GE_I32_e64_]], %bb.1, [[S_OR_B32_]], %bb.2 + ; CHECK-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.1, [[COPY7]], %bb.2 + ; CHECK-NEXT: [[PHI8:%[0-9]+]]:vreg_64 = PHI [[PHI1]], %bb.1, [[REG_SEQUENCE4]], %bb.2 + ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32 = SI_IF [[PHI6]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6.bb3: + ; CHECK-NEXT: 
successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 31, [[PHI3]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI3]], [[V_LSHRREV_B32_e64_]], 0, implicit $exec + ; CHECK-NEXT: [[V_ASHRREV_I32_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 1, [[V_ADD_U32_e64_]], implicit $exec + ; CHECK-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[PHI7]], $exec_lo, implicit-def dead $scc + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_ANDN2_B32_1]] + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7.loop.latch: + ; CHECK-NEXT: successors: %bb.8(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI9:%[0-9]+]]:vgpr_32 = PHI [[PHI5]], %bb.3, [[V_MOV_B32_e32_]], %bb.4 + ; CHECK-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI2]], 1, implicit-def dead $scc + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[S_ADD_I32_]], [[COPY]], implicit $exec + ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[V_CMP_GE_U32_e64_]], [[PHI]], implicit-def dead $scc + ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8.exit: + ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[SI_SPILL_V64_RESTORE]], [[PHI9]], 0, 0, implicit $exec :: (store (s32) into %ir.p3, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 +entry: +; entry +; | +; loop.header<-+ +; | | | +; bb1 | | +; | \ | | +; bb2 bb3 | +; | | | +; loop.latch---+ +; | +; exit + %ld1 = load i32, ptr addrspace(1) %p1, align 1 + br label %loop.header + +loop.header: + %phi.inc = phi i32 [ 0, %entry ], [ %inc, %loop.latch ] + %phi1 = phi i32 [ %ld1, %entry ], [ %phi2, %loop.latch ] + %cond1 = icmp slt i32 %phi.inc, %ld1 + br i1 %cond1, label %bb1, label %bb3 + +bb1: + %sext = sext i32 %phi.inc to i64 + %gep = getelementptr inbounds i32, ptr addrspace(1) %p2, i64 %sext + %ld2 = load i32, ptr addrspace(1) %gep, align 4 + %cond2 = icmp sgt i32 %ld2, %ld1 + br i1 %cond2, label %bb2, label %bb3 + +bb2: + store i32 %phi1, ptr addrspace(1) %gep, align 4 + br label %loop.latch + +bb3: + %div = sdiv i32 %phi1, 2 + br label %loop.latch + +loop.latch: + %phi2 = phi i32 [ 1, %bb2 ], [ %div, %bb3 ] + %inc = add i32 %phi.inc, 1 + %cond3 = icmp ult i32 %inc, %TC + br i1 %cond3, label %loop.header, label %exit + +exit: + store i32 %phi2, ptr addrspace(1) %p3, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/test_ers_spill_loop_value_in_exit_block.ll b/llvm/test/CodeGen/AMDGPU/test_ers_spill_loop_value_in_exit_block.ll new file mode 100644 index 0000000000000..4746d5cf3d3ce --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/test_ers_spill_loop_value_in_exit_block.ll @@ -0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=amdgpu-early-register-spilling -verify-machineinstrs -print-after=amdgpu-early-register-spilling -max-vgprs=10 < %s 2>&1 | FileCheck %s + +; +; bb.0.entry +; | +; +<-----+ 
+; bb.1.loop | +; +------+ +; | +; bb.2.exit +; +define amdgpu_ps void @test(ptr addrspace(1) %p1, ptr addrspace(1) %p2, ptr addrspace(1) %p3, ptr addrspace(1) %p4, ptr addrspace(1) %p5, i32 %TC) { + ; CHECK-LABEL: name: test + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY8]], %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[REG_SEQUENCE]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1 + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[SI_SPILL_V32_RESTORE]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE4]], 0, 0, implicit $exec :: (load (s32) from %ir.p2, addrspace 1) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE4]], [[SI_SPILL_V32_RESTORE1]], 0, 0, implicit $exec :: (store (s32) into %ir.p2, addrspace 1) + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.loop: + ; CHECK-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.0, %5, %bb.1 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %3, %bb.1 + ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI1]], 1, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI1]], 2, implicit-def dead $scc + ; CHECK-NEXT: [[V_CMP_GE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GE_U32_e64 [[S_ADD_I32_1]], [[GLOBAL_LOAD_DWORD]], implicit $exec + ; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[V_CMP_GE_U32_e64_]], [[PHI]], implicit-def dead $scc + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY 
[[S_ADD_I32_]], implicit $exec + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_1]], implicit $exec + ; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.exit: + ; CHECK-NEXT: SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[GLOBAL_LOAD_DWORD]], %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; CHECK-NEXT: [[V_ASHRREV_I32_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[COPY11]], implicit $exec + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY12]], %stack.4, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[REG_SEQUENCE1]], %stack.5, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.5, align 4, addrspace 5) + ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[V_ASHRREV_I32_e64_]], %subreg.sub1 + ; CHECK-NEXT: [[V_LSHLREV_B64_pseudo_e64_:%[0-9]+]]:vreg_64 = nsw V_LSHLREV_B64_pseudo_e64 2, [[REG_SEQUENCE5]], implicit $exec + ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[REG_SEQUENCE3]].sub0, [[V_LSHLREV_B64_pseudo_e64_]].sub0, 0, implicit $exec + ; CHECK-NEXT: %83:vgpr_32, dead $sgpr_null = V_ADDC_U32_e64 [[REG_SEQUENCE3]].sub1, [[V_LSHLREV_B64_pseudo_e64_]].sub1, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %83, %subreg.sub1 + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE6]], 0, 0, implicit $exec :: (load (s8) from %ir.gep.le, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE6]], 1, 0, implicit $exec :: (load (s8) from %ir.gep.le + 1, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE1]], 8, [[GLOBAL_LOAD_UBYTE]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE2:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE6]], 2, 0, implicit $exec :: (load (s8) from %ir.gep.le + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE3:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE6]], 3, 0, implicit $exec :: (load (s8) from %ir.gep.le + 3, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE3]], 8, [[GLOBAL_LOAD_UBYTE2]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_1]], 16, [[V_LSHL_OR_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE4:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (load (s8) from %ir.p3, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE5:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 1, 0, implicit $exec :: (load (s8) from %ir.p3 + 1, addrspace 1) + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_3:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE5]], 8, [[GLOBAL_LOAD_UBYTE4]], implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE6:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 2, 0, implicit $exec :: (load (s8) from %ir.p3 + 2, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE7:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE2]], 3, 0, implicit $exec :: (load (s8) from %ir.p3 + 3, addrspace 1) + ; CHECK-NEXT: 
[[V_LSHL_OR_B32_e64_4:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[GLOBAL_LOAD_UBYTE7]], 8, [[GLOBAL_LOAD_UBYTE6]], implicit $exec + ; CHECK-NEXT: [[V_LSHL_OR_B32_e64_5:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[V_LSHL_OR_B32_e64_4]], 16, [[V_LSHL_OR_B32_e64_3]], implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.5, align 4, addrspace 5) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[SI_SPILL_V64_RESTORE]], 0, 0, implicit $exec :: (load (s32) from %ir.p4, addrspace 1) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE2:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[COPY11]], %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) + ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[GLOBAL_LOAD_DWORD1]], [[SI_SPILL_V32_RESTORE2]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD1]], [[V_ADD_U32_e64_]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[V_MUL_LO_U32_e64_]], 0, 0, implicit $exec :: (store (s32) into %ir.p3, addrspace 1) + ; CHECK-NEXT: [[V_SUB_U32_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[V_MUL_LO_U32_e64_]], [[V_ADD_U32_e64_]], 0, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_SUB_U32_e64_]], [[SI_SPILL_V32_RESTORE1]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[SI_SPILL_V64_RESTORE]], [[V_MUL_LO_U32_e64_1]], 0, 0, implicit $exec :: (store (s32) into %ir.p4, addrspace 1) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE3:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[SI_SPILL_V32_RESTORE3]], [[V_LSHL_OR_B32_e64_5]], implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE4:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5) + ; CHECK-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 [[V_LSHL_OR_B32_e64_2]], [[SI_SPILL_V32_RESTORE4]], [[V_MUL_LO_U32_e64_2]], implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE1:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[SI_SPILL_V64_RESTORE1]], [[V_ADD3_U32_e64_]], 0, 0, implicit $exec :: (store (s32) into %ir.p5, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 +entry: +; entry +; | +; +<----+ +; loop | +; +-----+ +; | +; exit + %ld1 = load i32, ptr addrspace(1) %p2, align 4 + store i32 %TC, ptr addrspace(1) %p2 + br label %loop + +loop: + %phi = phi i32 [ 100, %entry ], [ %add, %loop ] + %phi.inc = phi i32 [ 0, %entry ], [ %inc, %loop ] + %sext = sext i32 %phi.inc to i64 + %gep = getelementptr inbounds i32, ptr addrspace(1) %p1, i64 %sext + %ld = load i32, ptr addrspace(1) %gep, align 1 + %add = add i32 %ld, %phi.inc + %inc = add i32 %phi.inc, 1 + %cond = icmp ult i32 %inc, %ld1 + br i1 %cond, label %loop, label %exit + +exit: + %ld2 = load i32, ptr addrspace(1) %p3, align 1 + %ld3 = load i32, ptr addrspace(1) %p4 + %add1 = add i32 %ld3, %inc + %mul1 = mul i32 %ld3, %add1 + store i32 %mul1, ptr addrspace(1) %p3 + %sub1 = sub i32 %mul1, %add1 + %mul2 = mul i32 %sub1, %TC + store i32 %mul2, ptr addrspace(1) %p4 + %mul3 = 
mul i32 %ld1, %ld2 + %add2 = add i32 %mul3, %add + store i32 %add2, ptr addrspace(1) %p5 + ret void +}