-
Notifications
You must be signed in to change notification settings - Fork 15.5k
Add Next-use distance analysis #171520
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Add Next-use distance analysis #171520
Conversation
|
@llvm/pr-subscribers-backend-amdgpu Author: Konstantina Mitropoulou (kmitropoulou) ChangesPatch is 520.69 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/171520.diff 21 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 5df11a45b4889..5e35ed64ec9cc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -45,6 +45,7 @@ FunctionPass *createSIWholeQuadModeLegacyPass();
FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIOptimizeExecMaskingPreRAPass();
FunctionPass *createSIOptimizeVGPRLiveRangeLegacyPass();
+FunctionPass *createAMDGPUNextUseAnalysisPass();
FunctionPass *createSIFixSGPRCopiesLegacyPass();
FunctionPass *createLowerWWMCopiesPass();
FunctionPass *createSIMemoryLegalizerPass();
@@ -191,6 +192,9 @@ extern char &SIFixSGPRCopiesLegacyID;
void initializeSIFixVGPRCopiesLegacyPass(PassRegistry &);
extern char &SIFixVGPRCopiesID;
+void initializeAMDGPUNextUseAnalysisPassPass(PassRegistry &);
+extern char &AMDGPUNextUseAnalysisID;
+
void initializeSILowerWWMCopiesLegacyPass(PassRegistry &);
extern char &SILowerWWMCopiesLegacyID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.cpp
new file mode 100644
index 0000000000000..de9b443a49709
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.cpp
@@ -0,0 +1,573 @@
+//===---------------------- AMDGPUNextUseAnalysis.cpp --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUNextUseAnalysis.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/InitializePasses.h"
+#include <limits>
+#include <queue>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-next-use-analysis"
+
+// Hidden boolean command-line flag "dump-next-use-distance", off by default.
+// NOTE(review): presumably makes the pass print the computed next-use
+// distances (see printAllDistances); the consumer is not visible here, so
+// confirm against the pass driver.
+static cl::opt<bool> DumpNextUseDistance("dump-next-use-distance",
+ cl::init(false), cl::Hidden);
+
+/// Returns true if the CFG edge \p From -> \p To is a loop backedge: MLI
+/// reports the same (non-null) loop for both blocks, \p To is that loop's
+/// header and \p From is one of its latches.
+bool AMDGPUNextUseAnalysis::isBackedge(MachineBasicBlock *From,
+                                       MachineBasicBlock *To) const {
+  // Without a CFG edge there is nothing to classify.
+  if (!From->isSuccessor(To))
+    return false;
+
+  MachineLoop *FromLoop = MLI->getLoopFor(From);
+  if (!FromLoop || FromLoop != MLI->getLoopFor(To))
+    return false;
+
+  // Only edges into the header can be backedges; check this before paying
+  // for the latch list.
+  if (To != FromLoop->getHeader())
+    return false;
+
+  SmallVector<MachineBasicBlock *, 2> LoopLatches;
+  FromLoop->getLoopLatches(LoopLatches);
+  return llvm::find(LoopLatches, From) != LoopLatches.end();
+}
+
+// Calculate the shortest distance between two blocks using Dijkstra algorithm.
+//
+// Backedges (see isBackedge) are never followed. Entering a block costs its
+// instruction count, multiplied by LoopWeight and a loop-depth factor when the
+// block sits in a loop that \p ToMBB is not part of (see GetDistance below).
+//
+// \returns the blocks of the cheapest path in reverse order (\p ToMBB first,
+// \p FromMBB last) together with the accumulated cost of the blocks strictly
+// between the two end points. If \p ToMBB is not reachable, returns an empty
+// path and std::numeric_limits<uint64_t>::max().
+std::pair<SmallVector<MachineBasicBlock *>, uint64_t>
+AMDGPUNextUseAnalysis::getShortestPath(MachineBasicBlock *FromMBB,
+                                       MachineBasicBlock *ToMBB) {
+  assert(FromMBB != ToMBB && "The basic blocks should be different.\n");
+  struct Data {
+    MachineBasicBlock *BestPred = nullptr;
+    uint64_t ShortestDistance = std::numeric_limits<uint64_t>::max();
+  };
+  DenseMap<MachineBasicBlock *, Data> MBBData;
+  DenseSet<MachineBasicBlock *> Visited;
+
+  // Every queue entry carries the distance it was pushed with. Ordering the
+  // heap by a key stored elsewhere (and updated while the block is still
+  // queued) would break the heap invariant of std::priority_queue, so an
+  // improved distance is pushed as a new entry and stale entries are dropped
+  // when popped (lazy deletion). Only the distance is compared, never the
+  // pointer value, so the pop order does not depend on allocation addresses.
+  using QueueEntry = std::pair<uint64_t, MachineBasicBlock *>;
+  auto Cmp = [](const QueueEntry &LHS, const QueueEntry &RHS) {
+    return LHS.first > RHS.first;
+  };
+  std::priority_queue<QueueEntry, std::vector<QueueEntry>, decltype(Cmp)>
+      Worklist(Cmp);
+
+  // The destination's loop does not change during the search.
+  MachineLoop *LoopTo = MLI->getLoopFor(ToMBB);
+
+  // Cost of stepping into BB on the way to ToMBB.
+  auto GetDistance = [this, ToMBB, LoopTo](MachineBasicBlock *BB) -> uint64_t {
+    MachineLoop *LoopBB = MLI->getLoopFor(BB);
+    // BB is in a loop nested strictly inside ToMBB's loop: weight by the
+    // number of extra nesting levels.
+    if (LoopBB && LoopTo && LoopTo->contains(LoopBB) && LoopTo != LoopBB)
+      return BB->size() * LoopWeight *
+             (MLI->getLoopDepth(BB) - MLI->getLoopDepth(ToMBB));
+    // BB's loop is the same as, or encloses, ToMBB's loop: no penalty.
+    if (LoopBB && LoopTo && LoopBB->contains(LoopTo))
+      return BB->size();
+    // BB is in a loop unrelated to the destination: weight by BB's full depth.
+    if ((!LoopTo && LoopBB) || (LoopBB && LoopTo && !LoopTo->contains(LoopBB)))
+      return BB->size() * LoopWeight * MLI->getLoopDepth(BB);
+    // BB is not in any loop.
+    return BB->size();
+  };
+
+  MBBData[FromMBB] = {nullptr, 0};
+  Worklist.push({0, FromMBB});
+
+  while (!Worklist.empty()) {
+    const uint64_t CurDist = Worklist.top().first;
+    MachineBasicBlock *CurMBB = Worklist.top().second;
+    Worklist.pop();
+
+    // A block is finalized the first time it is popped; later entries for it
+    // are stale.
+    if (!Visited.insert(CurMBB).second)
+      continue;
+
+    if (CurMBB == ToMBB) {
+      // We found the destination node, build the path ToMBB->...->FromMBB
+      SmallVector<MachineBasicBlock *> Path;
+      MachineBasicBlock *PathMBB = ToMBB;
+      while (PathMBB != nullptr) {
+        Path.push_back(PathMBB);
+        if (PathMBB == FromMBB)
+          break;
+        auto It = MBBData.find(PathMBB);
+        PathMBB = It != MBBData.end() ? It->second.BestPred : nullptr;
+      }
+      assert(Path.back() == FromMBB && "Incomplete path!");
+      // Report the distance up to ToMBB's predecessor, i.e. without the cost
+      // of ToMBB itself. FromMBB's own distance is 0 by construction.
+      MachineBasicBlock *Pred = MBBData[CurMBB].BestPred;
+      return {Path, MBBData[Pred].ShortestDistance};
+    }
+
+    for (MachineBasicBlock *Succ : CurMBB->successors()) {
+      if (isBackedge(CurMBB, Succ))
+        continue;
+
+      // Keep the arithmetic in 64 bits; distances are scaled by LoopWeight and
+      // must not be truncated.
+      const uint64_t NewSuccDist = CurDist + GetDistance(Succ);
+
+      Data &SuccData = MBBData[Succ];
+      if (NewSuccDist >= SuccData.ShortestDistance)
+        continue;
+
+      // We found a better path to Succ, update best predecessor and distance
+      SuccData.BestPred = CurMBB;
+      SuccData.ShortestDistance = NewSuccDist;
+      Worklist.push({NewSuccDist, Succ});
+    }
+  }
+  return {{}, std::numeric_limits<uint64_t>::max()};
+}
+
+/// Fills ShortestPathTable with the result of getShortestPath() for every
+/// ordered pair of distinct basic blocks of \p MF.
+void AMDGPUNextUseAnalysis::calculateShortestPaths(MachineFunction &MF) {
+  for (MachineBasicBlock &Source : MF) {
+    for (MachineBasicBlock &Target : MF) {
+      // getShortestPath() asserts that the two blocks differ.
+      if (&Source != &Target)
+        ShortestPathTable[std::make_pair(&Source, &Target)] =
+            getShortestPath(&Source, &Target);
+    }
+  }
+}
+
+/// Returns the instruction distance from \p CurMI to \p UseMI.
+///
+/// Within one block this is the difference of the two instruction ids.
+/// Across blocks it is the number of ids left in \p CurMI's block, plus the
+/// block-to-block distance from the shortest-path table, plus the position of
+/// \p UseMI inside its own block (counted from 1).
+uint64_t AMDGPUNextUseAnalysis::calculateShortestDistance(MachineInstr *CurMI,
+                                                          MachineInstr *UseMI) {
+  MachineBasicBlock *DefBlock = CurMI->getParent();
+  MachineBasicBlock *UseBlock = UseMI->getParent();
+
+  if (DefBlock == UseBlock)
+    return getInstrId(UseMI) - getInstrId(CurMI);
+
+  // Remainder of the defining block after CurMI.
+  MachineInstr &LastMIOfDefBlock = *std::prev(DefBlock->instr_end());
+  uint64_t TailOfDefBlock = getInstrId(&LastMIOfDefBlock) - getInstrId(CurMI);
+
+  // Prefix of the using block up to and including UseMI.
+  MachineInstr &FirstMIOfUseBlock = *UseBlock->instr_begin();
+  uint64_t HeadOfUseBlock =
+      getInstrId(UseMI) - getInstrId(&FirstMIOfUseBlock) + 1;
+
+  auto BlockDistance = getShortestDistanceFromTable(DefBlock, UseBlock);
+  assert(BlockDistance != std::numeric_limits<uint64_t>::max());
+  return TailOfDefBlock + BlockDistance + HeadOfUseBlock;
+}
+
+/// Finds the latch of \p CurLoop with the smallest table distance from
+/// \p CurMBB.
+///
+/// \returns (0, CurMBB) as soon as \p CurMBB itself is met in the latch list;
+/// otherwise the smallest distance and its latch. If no latch has a distance
+/// below std::numeric_limits<uint64_t>::max(), the returned block is null.
+std::pair<uint64_t, MachineBasicBlock *>
+AMDGPUNextUseAnalysis::getShortestDistanceToExitingLatch(
+    MachineBasicBlock *CurMBB, MachineLoop *CurLoop) const {
+  SmallVector<MachineBasicBlock *, 2> LoopLatches;
+  CurLoop->getLoopLatches(LoopLatches);
+
+  std::pair<uint64_t, MachineBasicBlock *> Closest(
+      std::numeric_limits<uint64_t>::max(), nullptr);
+  for (MachineBasicBlock *Latch : LoopLatches) {
+    if (Latch == CurMBB)
+      return {0, CurMBB};
+
+    const uint64_t Candidate = getShortestDistanceFromTable(CurMBB, Latch);
+    // Strictly smaller only: on ties the earlier latch is kept.
+    if (Candidate < Closest.first)
+      Closest = {Candidate, Latch};
+  }
+  return Closest;
+}
+
+/// Estimates the length of one trip through the loop of \p CurMBB that passes
+/// through \p CurMBB, and reports the latch the trip ends in.
+///
+/// The result is header size + table distance header->CurMBB + CurMBB size +
+/// table distance CurMBB->latch + latch size, with the terms that coincide
+/// (CurMBB being the header or a latch) left out. A single-block loop yields
+/// just the size of \p CurMBB.
+/// The loop of \p CurMBB is dereferenced unconditionally, so \p CurMBB must be
+/// inside a loop.
+std::pair<uint64_t, MachineBasicBlock *>
+AMDGPUNextUseAnalysis::getLoopDistanceAndExitingLatch(
+    MachineBasicBlock *CurMBB) const {
+  MachineLoop *Loop = MLI->getLoopFor(CurMBB);
+
+  if (Loop->getNumBlocks() == 1)
+    return std::make_pair(CurMBB->size(), CurMBB);
+
+  MachineBasicBlock *Header = Loop->getHeader();
+
+  // CurMBB is the header: header, path to the nearest latch, latch.
+  if (CurMBB == Header) {
+    auto [ToLatch, Latch] = getShortestDistanceToExitingLatch(CurMBB, Loop);
+    uint64_t Total = Header->size() + ToLatch + Latch->size();
+    return std::make_pair(Total, Latch);
+  }
+
+  // CurMBB is a latch: header, path to CurMBB, CurMBB.
+  SmallVector<MachineBasicBlock *, 2> LoopLatches;
+  Loop->getLoopLatches(LoopLatches);
+  if (llvm::is_contained(LoopLatches, CurMBB)) {
+    uint64_t Total = Header->size() +
+                     getShortestDistanceFromTable(Header, CurMBB) +
+                     CurMBB->size();
+    return std::make_pair(Total, CurMBB);
+  }
+
+  // General case: header, path to CurMBB, CurMBB, path to the nearest latch,
+  // latch.
+  auto HeaderToCur = getShortestDistanceFromTable(Header, CurMBB);
+  auto [ToLatch, Latch] = getShortestDistanceToExitingLatch(CurMBB, Loop);
+  uint64_t Total = Header->size() + HeaderToCur + CurMBB->size() + ToLatch +
+                   Latch->size();
+  return std::make_pair(Total, Latch);
+}
+
+// Calculates the overhead of a loop nest for three cases: 1. the use is outside
+// of the current loop, but they share the same loop nest 2. the use is
+// outside of the current loop nest and 3. the use is in a parent loop of the
+// current loop nest.
+//
+// Returns the weighted distance through the selected loop together with the
+// latch block that distance was measured to.
+// IsUseOutsideOfTheCurrentLoopNest is tested first and IsUseInParentLoop only
+// if it is false; if neither flag is set the function hits llvm_unreachable.
+std::pair<uint64_t, MachineBasicBlock *>
+AMDGPUNextUseAnalysis::getNestedLoopDistanceAndExitingLatch(
+ MachineBasicBlock *CurMBB, MachineBasicBlock *UseMBB,
+ bool IsUseOutsideOfTheCurrentLoopNest, bool IsUseInParentLoop) {
+ MachineLoop *CurLoop = MLI->getLoopFor(CurMBB);
+ MachineLoop *UseLoop = MLI->getLoopFor(UseMBB);
+
+ // Weighted distance through loop ML, measured from its header to a latch.
+ // Every block contributes size * (its loop depth - UseLoopDepth) *
+ // LoopWeight.
+ auto GetLoopDistance =
+ [&](MachineLoop *ML) -> std::pair<uint64_t, MachineBasicBlock *> {
+ uint64_t ShortestDistance = 0;
+ // Receives the unweighted distance from getLoopDistanceAndExitingLatch;
+ // only the latch returned alongside it is used below.
+ uint64_t TmpDist = 0;
+ MachineBasicBlock *ExitingLatch = nullptr;
+ // Depths are taken relative to the use; a use outside the nest counts as
+ // depth 0.
+ unsigned UseLoopDepth =
+ IsUseOutsideOfTheCurrentLoopNest ? 0 : MLI->getLoopDepth(UseMBB);
+ // Single-block loop: the header is the only block to weigh.
+ if (ML->getNumBlocks() == 1) {
+ ShortestDistance = ML->getHeader()->size() *
+ (MLI->getLoopDepth(ML->getHeader()) - UseLoopDepth) *
+ LoopWeight;
+ return std::make_pair(ShortestDistance, ML->getLoopLatch());
+ }
+ std::tie(TmpDist, ExitingLatch) =
+ getLoopDistanceAndExitingLatch(ML->getHeader());
+ // Sum the weighted sizes of the blocks on the table's header->latch path.
+ for (MachineBasicBlock *MBB :
+ getShortestPathFromTable(ML->getHeader(), ExitingLatch))
+ ShortestDistance +=
+ MBB->size() * (MLI->getLoopDepth(MBB) - UseLoopDepth) * LoopWeight;
+ return std::make_pair(ShortestDistance, ExitingLatch);
+ };
+
+ if (IsUseOutsideOfTheCurrentLoopNest) {
+ MachineLoop *OutermostLoop = CurLoop->getOutermostLoop();
+ if (OutermostLoop->contains(UseLoop)) {
+ // The CurLoop and the UseLoop are independent and they are in the same
+ // loop nest.
+ if (MLI->getLoopDepth(CurMBB) <= MLI->getLoopDepth(UseMBB)) {
+ return GetLoopDistance(CurLoop);
+ } else {
+ assert(CurLoop != OutermostLoop && "The loop cannot be the outermost.");
+ // Walk up from CurLoop until the loop's header has the same depth as
+ // UseMBB, or the outermost loop is reached.
+ MachineLoop *OuterLoopOfCurLoop = CurLoop;
+ while (OutermostLoop != OuterLoopOfCurLoop &&
+ MLI->getLoopDepth(OuterLoopOfCurLoop->getHeader()) !=
+ MLI->getLoopDepth(UseMBB)) {
+ OuterLoopOfCurLoop = OuterLoopOfCurLoop->getParentLoop();
+ }
+ return GetLoopDistance(OuterLoopOfCurLoop);
+ }
+ } else {
+ // We should take into consideration the whole loop nest in the
+ // calculation of the distance because we will reach the use after
+ // executing the whole loop nest.
+ return GetLoopDistance(OutermostLoop);
+ }
+ } else if (IsUseInParentLoop) {
+ // NOTE(review): if no sub-loop of UseLoop contains CurLoop this stays null
+ // and GetLoopDistance dereferences it; callers presumably guarantee that
+ // UseLoop strictly contains CurLoop - confirm.
+ MachineLoop *UseLoopSubLoop = nullptr;
+ for (MachineLoop *ML : UseLoop->getSubLoopsVector()) {
+ // All the sub-loops of the UseLoop will be executed before the use.
+ // Hence, we should take this into consideration in distance calculation.
+ if (ML->contains(CurLoop)) {
+ UseLoopSubLoop = ML;
+ break;
+ }
+ }
+ return GetLoopDistance(UseLoopSubLoop);
+ }
+ llvm_unreachable("Failed to calculate the loop distance!");
+}
+
+/// Distance from \p CurMI, which sits in a loop, to a \p UseMI that is either
+/// outside of that loop nest or in an enclosing loop.
+///
+/// The result is the loop overhead, plus the table distance from the loop's
+/// exiting latch to the use block, plus the position of \p UseMI inside its
+/// block (counted from 1). A use in a parent loop that is a PHI fed only
+/// through a latch (see isIncomingValFromBackedge) is delegated to
+/// calculateBackedgeDistance().
+/// NOTE(review): when neither situation applies, ExitingLatch stays null and
+/// is passed to getShortestDistanceFromTable(); callers are presumably
+/// expected to rule that out - confirm.
+uint64_t AMDGPUNextUseAnalysis::calculateCurLoopDistance(Register DefReg,
+                                                         MachineInstr *CurMI,
+                                                         MachineInstr *UseMI) {
+  MachineBasicBlock *DefBlock = CurMI->getParent();
+  MachineBasicBlock *UseBlock = UseMI->getParent();
+  MachineLoop *DefLoop = MLI->getLoopFor(DefBlock);
+  MachineLoop *UseLoop = MLI->getLoopFor(UseBlock);
+
+  // The use loop strictly encloses the defining loop.
+  const bool UseInParentLoop = DefLoop && UseLoop && UseLoop != DefLoop &&
+                               UseLoop->contains(DefLoop);
+
+  // The use is in no loop at all, or in a loop that neither encloses nor is
+  // enclosed by the defining loop.
+  const bool UseOutsideLoopNest =
+      DefLoop && (!UseLoop || (!UseLoop->contains(DefLoop) &&
+                               !DefLoop->contains(UseLoop)));
+
+  uint64_t LoopDistance = 0;
+  MachineBasicBlock *ExitingLatch = nullptr;
+
+  if (UseOutsideLoopNest) {
+    const bool IsStandaloneLoop =
+        DefLoop->getSubLoops().empty() && DefLoop->isOutermost();
+    if (IsStandaloneLoop) {
+      // A loop without parent or children: one trip, scaled by LoopWeight.
+      std::tie(LoopDistance, ExitingLatch) =
+          getLoopDistanceAndExitingLatch(DefBlock);
+      LoopDistance *= LoopWeight;
+    } else {
+      std::tie(LoopDistance, ExitingLatch) =
+          getNestedLoopDistanceAndExitingLatch(DefBlock, UseBlock, true, false);
+    }
+  } else if (UseInParentLoop) {
+    assert(MLI->getLoopDepth(UseMBB) < MLI->getLoopDepth(CurMBB) &&
+           "The loop depth of the current instruction must be bigger than "
+           "these.\n");
+    if (isIncomingValFromBackedge(CurMI, UseMI, DefReg))
+      return calculateBackedgeDistance(CurMI, UseMI);
+
+    // Get the loop distance of all the inner loops of UseLoop.
+    std::tie(LoopDistance, ExitingLatch) =
+        getNestedLoopDistanceAndExitingLatch(DefBlock, UseBlock, false, true);
+  }
+
+  uint64_t HeadOfUseBlock =
+      getInstrId(UseMI) - getInstrId(&*UseBlock->instr_begin()) + 1;
+  return LoopDistance + getShortestDistanceFromTable(ExitingLatch, UseBlock) +
+         HeadOfUseBlock;
+}
+
+/// Distance from \p CurMI to \p UseMI when the value reaches the use through
+/// a loop backedge. \p UseMI must be inside a loop (asserted).
+///
+/// Cases, in order:
+///  - \p CurMI is in no loop: rest of its block + table distance + position
+///    of the use in its block.
+///  - both are in the same loop: rest of the block + distance to the nearest
+///    latch + latch size + position of the use (the latch terms are dropped
+///    when \p CurMI's block is that latch).
+///  - the loops are unrelated: one trip of \p CurMI's loop + table distance
+///    from its latch to the use block + position of the use.
+///  - the use loop encloses \p CurMI's loop: nested-loop distance + distance
+///    from the inner latch to the nearest latch of the use loop + that latch's
+///    size + position of the use.
+/// A use loop nested inside \p CurMI's loop is not handled (unreachable).
+uint64_t AMDGPUNextUseAnalysis::calculateBackedgeDistance(MachineInstr *CurMI,
+                                                          MachineInstr *UseMI) {
+  MachineBasicBlock *DefBlock = CurMI->getParent();
+  MachineBasicBlock *UseBlock = UseMI->getParent();
+  MachineLoop *DefLoop = MLI->getLoopFor(DefBlock);
+  MachineLoop *UseLoop = MLI->getLoopFor(UseBlock);
+  assert(UseLoop && "There is no backedge.\n");
+
+  uint64_t TailOfDefBlock =
+      getInstrId(&*std::prev(DefBlock->instr_end())) - getInstrId(CurMI);
+  uint64_t HeadOfUseBlock =
+      getInstrId(UseMI) - getInstrId(&*UseBlock->instr_begin()) + 1;
+
+  if (!DefLoop)
+    return TailOfDefBlock + getShortestDistanceFromTable(DefBlock, UseBlock) +
+           HeadOfUseBlock;
+
+  if (DefLoop == UseLoop) {
+    auto [ToLatch, Latch] =
+        getShortestDistanceToExitingLatch(DefBlock, DefLoop);
+    if (Latch == DefBlock)
+      return TailOfDefBlock + HeadOfUseBlock;
+    return TailOfDefBlock + ToLatch + Latch->size() + HeadOfUseBlock;
+  }
+
+  if (!DefLoop->contains(UseLoop)) {
+    if (!UseLoop->contains(DefLoop)) {
+      // Unrelated loops.
+      auto [TripDistance, Latch] = getLoopDistanceAndExitingLatch(DefBlock);
+      return TripDistance + getShortestDistanceFromTable(Latch, UseBlock) +
+             HeadOfUseBlock;
+    }
+
+    // The use loop is a parent of the defining loop.
+    auto [InnerDistance, InnerLatch] =
+        getNestedLoopDistanceAndExitingLatch(DefBlock, UseBlock, false, true);
+    auto [ToOuterLatch, OuterLatch] =
+        getShortestDistanceToExitingLatch(InnerLatch, UseLoop);
+    return InnerDistance + ToOuterLatch + OuterLatch->size() + HeadOfUseBlock;
+  }
+
+  llvm_unreachable("The backedge distance has not been calculated!");
+}
+
+/// Returns true if \p UseMI is a PHI in the header of its loop that receives
+/// \p DefReg from at least one latch of that loop and from no other
+/// predecessor. If \p CurMI is in a loop, that loop must be contained in the
+/// PHI's loop.
+bool AMDGPUNextUseAnalysis::isIncomingValFromBackedge(MachineInstr *CurMI,
+                                                      MachineInstr *UseMI,
+                                                      Register DefReg) const {
+  if (!UseMI->isPHI())
+    return false;
+
+  MachineBasicBlock *PhiBlock = UseMI->getParent();
+  MachineLoop *UseLoop = MLI->getLoopFor(PhiBlock);
+  if (!UseLoop || PhiBlock != UseLoop->getHeader())
+    return false;
+
+  MachineLoop *DefLoop = MLI->getLoopFor(CurMI->getParent());
+  if (DefLoop && !UseLoop->contains(DefLoop))
+    return false;
+
+  SmallVector<MachineBasicBlock *, 2> LoopLatches;
+  UseLoop->getLoopLatches(LoopLatches);
+
+  bool SeenLatchIncoming = false;
+  bool SeenOtherIncoming = false;
+  // Operand 0 is skipped; the rest are walked as (register, block) pairs.
+  for (unsigned Idx = 1, NumOps = UseMI->getNumOperands(); Idx != NumOps;
+       Idx += 2) {
+    const MachineOperand &RegMO = UseMI->getOperand(Idx);
+    const MachineOperand &MBBMO = UseMI->getOperand(Idx + 1);
+    assert(RegMO.isReg() && "Expected register operand of PHI");
+    assert(MBBMO.isMBB() && "Expected MBB operand of PHI");
+    if (RegMO.getReg() != DefReg)
+      continue;
+
+    if (llvm::is_contained(LoopLatches, MBBMO.getMBB()))
+      SeenLatchIncoming = true;
+    else
+      SeenOtherIncoming = true;
+  }
+  return SeenLatchIncoming && !SeenOtherIncoming;
+}
+
+/// Prints one "From: <name>-> To:<name> = <distance>" line to errs() for every
+/// entry of ShortestPathTable, in the table's iteration order.
+void AMDGPUNextUseAnalysis::dumpShortestPaths() const {
+  for (const auto &Entry : ShortestPathTable) {
+    const auto &[SrcMBB, DstMBB] = Entry.first;
+    // Entry.second is (path, distance); only the distance is printed, so the
+    // path is left where it is instead of being copied.
+    errs() << "From: " << SrcMBB->getName() << "-> To:" << DstMBB->getName()
+           << " = " << Entry.second.second << "\n";
+  }
+}
+
+/// Returns true if \p Reg is AMDGPU::VCC or one of its halves
+/// (AMDGPU::VCC_LO, AMDGPU::VCC_HI).
+bool isVCCReg(Register Reg) {
+  if (Reg == AMDGPU::VCC)
+    return true;
+  if (Reg == AMDGPU::VCC_LO)
+    return true;
+  return Reg == AMDGPU::VCC_HI;
+}
+
+/// Prints to errs() the next-use distance of every register defined in \p MF,
+/// skipping physical registers, AGPRs and VCC. Registers for which
+/// getNextUseDistance() yields no value are printed as "null".
+void AMDGPUNextUseAnalysis::printAllDistances(MachineFunction &MF) {
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      for (const MachineOperand &MO : MI.operands()) {
+        // Only non-null register definitions are of interest.
+        if (!MO.isReg() || MO.getReg() == 0 || MO.isUse())
+          continue;
+
+        Register Reg = MO.getReg();
+        if (Reg.isPhysical() || TRI->isAGPR(*MRI, Reg) || isVCCReg(Reg))
+          continue;
+
+        // Query before printing anything for this register.
+        std::optional<uint64_t> Distance = getNextUseDistance(Reg);
+        errs() << "Next-use distance of Register " << printReg(Reg, TRI)
+               << " = ";
+        if (Distance)
+          errs() << *Distance;
+        else
+          errs() << "null";
+        errs() << "\n";
+      }
+    }
+  }
+}
+
+std::optional<uint64_t>
+AMDGPUNextUseAnalysis::getNextUseDistance(Register DefReg) {
+ assert(!DefReg.isPhysical() && !TRI->isAGPR(*MRI, DefReg) &&
+ !isVCCReg(DefReg) &&
+ "Next-use distance is calculated for SGPRs and VGPRs");
+ uint64_t NextUseDistance = std::numeric_limits<uint64_t>::max();
+ uint64_t CurrentNextUseDistance = std::numeric_limits<uint64_t>::max();
+ for (auto &UseMI : MRI->use_nodbg_instructions(DefReg)) {
+ MachineInstr *CurMI = &*MRI->def_instr_begin(DefReg);
+ MachineBasicBlock *CurMBB = CurMI->getParent();
+ MachineBasicBlock *UseMBB = UseMI.getParent();
+ MachineLoop *CurLoop = MLI->getLoopFor(CurMBB);
+ MachineLoop *UseLoop = MLI->getLoopFor(UseMBB);
+
+ bool IsUseOutsideOfTheDefintionLoop =
+ (CurLoop && !UseLoop) ||
+ (CurLoop && UseLoop &&
+ ((!UseLoop->contains(CurLoop) && !CurLoop->contains(UseLoop)) ||
+ (UseLoop->contains(CurLoop) && (UseLoop != CurLoop))));
+
+ if (IsUseOutsideOfTheDefintionLoop) {
+ CurrentNextUseDistance = calculateCurLoopDistance(DefReg, CurMI, &UseMI);
+ } else if (isIncomingValFromBackedge(CurMI, &UseMI, DefReg)) {
+ CurrentNextUseDistance = calculateBackedgeDistance(CurMI, &UseMI);
+ } else {
+ CurrentNextUseDistance = calculateShortestDistance(CurMI, &UseMI);
+ }
+
+ if (CurrentNextUseDistance < NextUseDistance)
+ NextUseDistance = CurrentNextUseDistance;
+ }
+ return NextUseDistance != std::numeric_limits<uint64_t>::ma...
[truncated]
|
You can test this locally with the following command:git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef([^a-zA-Z0-9_-]|$)|UndefValue::get)' 'HEAD~1' HEAD llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.cpp llvm/lib/Target/AMDGPU/AMDGPUNextUseAnalysis.h llvm/test/CodeGen/AMDGPU/test1.ll llvm/test/CodeGen/AMDGPU/test10.ll llvm/test/CodeGen/AMDGPU/test11.ll llvm/test/CodeGen/AMDGPU/test12.ll llvm/test/CodeGen/AMDGPU/test13.ll llvm/test/CodeGen/AMDGPU/test14.ll llvm/test/CodeGen/AMDGPU/test15.ll llvm/test/CodeGen/AMDGPU/test2.ll llvm/test/CodeGen/AMDGPU/test3.ll llvm/test/CodeGen/AMDGPU/test4.ll llvm/test/CodeGen/AMDGPU/test5.ll llvm/test/CodeGen/AMDGPU/test6.ll llvm/test/CodeGen/AMDGPU/test7.ll llvm/test/CodeGen/AMDGPU/test8.ll llvm/test/CodeGen/AMDGPU/test9.ll llvm/lib/Target/AMDGPU/AMDGPU.h llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp llvm/test/CodeGen/AMDGPU/llc-pipeline.llThe following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields `undef`. In tests, avoid using `undef`. For example, this is considered a bad practice: define void @fn() {
...
br i1 undef, ...
}Please use the following instead: define void @fn(i1 %cond) {
...
br i1 %cond, ...
}Please refer to the Undefined Behavior Manual for more information. |
|
How does this relate to #156079? |
This implementation of the next-use distance is designed for ERS (#171523). The ERS pass uses it not only for the next-use distance heuristic, but also to check whether one block is reachable from another, so the APIs are not exactly the same. There are also some implementation differences: e.g., #156079 ignores PHI nodes in the next-use distance calculation and supports sub-registers. |
| return {{}, std::numeric_limits<uint64_t>::max()}; | ||
| } | ||
|
|
||
| void AMDGPUNextUseAnalysis::calculateShortestPaths(MachineFunction &MF) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If I understand correctly, we should not spill too many registers, so later queries should only happen for a very limited number of blocks. Based on that, most of the computation here is just a waste of compile time. I would like us to defer this kind of path finding until we actually query for the next-use distance, so that we only need to work on a small number of blocks.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I will switch this to on-demand query with caching.
| } | ||
|
|
||
| std::optional<uint64_t> | ||
| AMDGPUNextUseAnalysis::getNextUseDistance(Register DefReg, MachineInstr *CurMI, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Now that we have LiveIntervals in ERS, you can pass it in here; it can help you calculate the distance.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not sure I follow. Could you please elaborate a bit?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I mean that you don't need to number the instructions inside ERS to calculate distance. You can calculate using LiveIntervals.
| insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeLegacyID); | ||
|
|
||
| if (OptNextUseAnalysis) | ||
| insertPass(&LiveVariablesID, &AMDGPUNextUseAnalysisID); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why? Shouldn't this only be added when it's requested by another pass?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I will remove it. I just have it for testing.
| ShortestPathTable; | ||
| /// We assume an approximate trip count of 1000 for all loops. | ||
| static constexpr const uint64_t LoopWeight = 1000; | ||
| /// Returns the shortest ditance from ShortestPathTable. Will crash if |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| /// Returns the shortest ditance from ShortestPathTable. Will crash if | |
| /// Returns the shortest distance from ShortestPathTable. Will crash if |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
| /// them and fills in \p ShortestPathTable. | ||
| void calculateShortestPaths(MachineFunction &MF); | ||
| /// If the path from \p MI to \p UseMI does not cross any loops, then this | ||
| /// \returns the shortest instruction distance between them. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should also describe what it returns if there are loops.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good catch! The comment was not phrased correctly. This is used only if MI and UseMI are not inside a loop.
| /// Given \p CurMI in a loop and \p UseMI outside the loop, this function | ||
| /// returns the minimum instruction path between \p CurMI and \p UseMI. | ||
| /// Please note that since \p CurMI is in a loop we don't care about the | ||
| /// exact position of the instruction in the block because we making a rough |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| /// exact position of the instruction in the block because we making a rough | |
| /// exact position of the instruction in the block because we're making a rough |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
| /// exact position of the instruction in the block because we making a rough | ||
| /// estimate of the dynamic instruction path length, given that the loop | ||
| /// iterates multiple times. | ||
| uint64_t calculateCurLoopDistance(Register DefReg, MachineInstr *CurMI, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why would this distance depend on DefReg?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is another function that is used only for testing. I added a TODO. It calculates the next-use distance for a definition register and it is only used by printAllDistances().
| std::pair<uint64_t, MachineBasicBlock *> | ||
| getLoopDistanceAndExitingLatch(MachineBasicBlock *CurMBB) const; | ||
|
|
||
| /// Returns the shortest ditance from ShortestPathTable. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think this comment adds much. Public APIs shouldn't really reference implementation details (plus, as other reviewers have suggested, the implementation might change).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You are right! It does not make sense. Removed it!
| Loop1->getLoopLatches(Latches); | ||
| auto It = llvm::find(Latches, From); | ||
| MachineBasicBlock *LoopHeader = Loop1->getHeader(); | ||
| return It != Latches.end() && From->isSuccessor(To) && To == LoopHeader; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can check isSuccessor right at the top of the function and exit early if it isn't. Likewise, you can check if To is Loop1's header before getting the loop latches and calling find on them.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
|
|
||
| uint64_t CurMIDistanceToBBEnd = | ||
| getInstrId(&*(std::prev(CurMBB->instr_end()))) - getInstrId(CurMI); | ||
| uint64_t UseDistanceFromBBBegin = getInstrId(&*(UseMI->getIterator())) - |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| uint64_t UseDistanceFromBBBegin = getInstrId(&*(UseMI->getIterator())) - | |
| uint64_t UseDistanceFromBBBegin = getInstrId(UseMI) - |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
:)
|
@kmitropoulou @alex-t PR #156079 and PR #171520 may be designed for different use cases, but they also clearly conflict with each other as they are attempting to add files with the same names. What is the plan for these two PRs? Does one supersede the other, or does one go first? Knowing this would help with efficient review. |
This analysis is used by ERS (Early Register Spilling) to find which register to spill.
It iterates over the uses of a definition and calculates the shortest distance between the definition and each use. The next-use distance of a register is the shortest of these distances across all of its uses.
There is a penalty for the values that are live across loops.
ERS chooses to spill the register with the longest next-use distance.