Improving stream partitioning debug output speed by roughly 10000x. (#11067)

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Analysis/Partitioning.cpp b/compiler/src/iree/compiler/Dialect/Stream/Analysis/Partitioning.cpp
index f54b670..c6b7593 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Analysis/Partitioning.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Analysis/Partitioning.cpp
@@ -20,7 +20,7 @@
 
 #ifndef NDEBUG
 
-void dumpPartition(Partition &partition, AsmState &state) {
+void dumpPartition(Partition &partition, AsmState &asmState) {
   if (partition.affinity) {
     llvm::dbgs() << " AFFINITY: ";
     partition.affinity.dump();
@@ -28,36 +28,32 @@
   }
   llvm::dbgs() << " INS:\n  ";
   llvm::interleaveComma(partition.ins, llvm::dbgs(), [&](Value in) {
-    in.printAsOperand(llvm::dbgs(), state);
+    in.printAsOperand(llvm::dbgs(), asmState);
   });
   llvm::dbgs() << "\n OUTS:\n  ";
   llvm::interleaveComma(partition.outs, llvm::dbgs(), [&](Value out) {
-    out.printAsOperand(llvm::dbgs(), state);
+    out.printAsOperand(llvm::dbgs(), asmState);
   });
   llvm::dbgs() << "\n OPS:\n";
   for (auto *op : llvm::reverse(partition.ops)) {
     llvm::dbgs() << "  ";
-    op->print(llvm::dbgs(), state);
+    op->print(llvm::dbgs(), asmState);
     llvm::dbgs() << "\n";
   }
 }
 
-void Partition::dump(Operation *parentOp) {
-  AsmState state(parentOp);
-  dumpPartition(*this, state);
-}
+void Partition::dump(AsmState &asmState) { dumpPartition(*this, asmState); }
 
-void PartitionSet::dump(Operation *parentOp) {
-  AsmState state(parentOp);
+void PartitionSet::dump(AsmState &asmState) {
   for (auto partition : llvm::enumerate(partitions)) {
     llvm::dbgs() << "PARTITION[" << partition.index() << "]:\n";
-    dumpPartition(partition.value(), state);
+    dumpPartition(partition.value(), asmState);
   }
 }
 
 #else
-void Partition::dump(Operation *parentOp) {}
-void PartitionSet::dump(Operation *parentOp) {}
+void Partition::dump(AsmState &asmState) {}
+void PartitionSet::dump(AsmState &asmState) {}
 #endif  // !NDEBUG
 
 LogicalResult Partition::verify(Location loc) {
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Analysis/Partitioning.h b/compiler/src/iree/compiler/Dialect/Stream/Analysis/Partitioning.h
index fee0047..197c0d0 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Analysis/Partitioning.h
+++ b/compiler/src/iree/compiler/Dialect/Stream/Analysis/Partitioning.h
@@ -37,7 +37,7 @@
   // streamable (such as constants and arithmetic).
   SetVector<Operation *> ops;
 
-  void dump(Operation *parentOp);
+  void dump(AsmState &asmState);
 
   // Verifies that the partition meets the required conditions.
   LogicalResult verify(Location loc);
@@ -53,7 +53,7 @@
   // Returns true if the set is empty (no streamable ops).
   bool empty() const { return partitions.empty(); }
 
-  void dump(Operation *parentOp);
+  void dump(AsmState &asmState);
 
   // Verifies that the partition set meets the required conditions.
   LogicalResult verify(Location loc);
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Analysis/Partitioning/ReferencePartitioning.cpp b/compiler/src/iree/compiler/Dialect/Stream/Analysis/Partitioning/ReferencePartitioning.cpp
index f687d5e..7e49fb8 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Analysis/Partitioning/ReferencePartitioning.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Analysis/Partitioning/ReferencePartitioning.cpp
@@ -9,6 +9,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Support/Debug.h"
+#include "mlir/IR/AsmState.h"
 #include "mlir/IR/PatternMatch.h"
 
 #define DEBUG_TYPE "iree-stream-partitioning"
@@ -18,6 +19,24 @@
 namespace IREE {
 namespace Stream {
 
+// Returns an AsmState at the ancestor of |block| that is isolated from above.
+// Returns nullptr if debug dumps of partitioning are disabled.
+static std::unique_ptr<AsmState> getRootAsmState(Block *block) {
+  LLVM_DEBUG({
+    auto *rootOp = block->getParentOp();
+    while (auto parentOp = rootOp->getParentOp()) {
+      if (!isa<IREE::Stream::TimelineOpInterface>(parentOp) &&
+          parentOp->hasTrait<OpTrait::IsIsolatedFromAbove>()) {
+        rootOp = parentOp;
+        break;
+      }
+      rootOp = parentOp;
+    }
+    return std::make_unique<AsmState>(rootOp);
+  });
+  return nullptr;
+}
+
 // This is terrible. See Stream/Analysis/Partition.h for a description of what
 // a real implementation would do. We want cost modeling for tie breakers when
 // an op could be in multiple partitions, cloning for ops that are not worth
@@ -53,6 +72,8 @@
   };
   DenseMap<Operation *, OpInfo> opInfos;
 
+  auto asmState = getRootAsmState(block);
+
   for (auto &op : llvm::reverse(*block)) {
     // Skip constants; they just add noise (and since they are heavily CSE'd
     // they have lots of users to test).
@@ -65,7 +86,7 @@
       if (!mlir::wouldOpBeTriviallyDead(&op)) {
         LLVM_DEBUG({
           llvm::dbgs() << "Side-effecting op forcing flush and freeze:\n";
-          op.dump();
+          op.print(llvm::dbgs(), *asmState);
         });
         usableBuilders.reset();
       }
@@ -88,7 +109,7 @@
 
     LLVM_DEBUG({
       llvm::dbgs() << "====\nPartitioning op:\n";
-      op.dump();
+      op.print(llvm::dbgs(), *asmState);
     });
 
     // Set bits for each partition this op may be able to be placed into.
@@ -99,7 +120,7 @@
       auto &userInfo = opInfos[user];
       LLVM_DEBUG({
         llvm::dbgs() << "Testing user:\n";
-        user->dump();
+        user->print(llvm::dbgs(), *asmState);
         for (auto membershipOrdinal : userInfo.membership.set_bits()) {
           llvm::dbgs() << "  member of partition " << membershipOrdinal << "\n";
         }
@@ -230,7 +251,7 @@
     partitionSet.partitions.push_back(std::move(partition));
   }
 
-  LLVM_DEBUG(partitionSet.dump(block->getParentOp()));
+  LLVM_DEBUG(partitionSet.dump(*asmState));
 
   return partitionSet;
 }
@@ -243,7 +264,7 @@
 
   auto favor = config.getFavor().getValue();
   if (favor == IREE::Stream::Favor::Debug) {
-    // Disable partitioning when favoring debugability.
+    // Disable partitioning when favoring debuggability.
     return waveSet;
   }
 
@@ -262,6 +283,8 @@
   };
   DenseMap<Operation *, OpInfo> opInfos;
 
+  auto asmState = getRootAsmState(block);
+
   for (auto &op : llvm::reverse(*block)) {
     // Skip constants; they just add noise (and since they are heavily CSE'd
     // they have lots of users to test).
@@ -281,7 +304,7 @@
 
     LLVM_DEBUG({
       llvm::dbgs() << "====\nPartitioning op:\n";
-      op.dump();
+      op.print(llvm::dbgs(), *asmState);
     });
 
     // Set bits for each wave this op may be able to be placed into.
@@ -292,7 +315,7 @@
       auto &userInfo = opInfos[user];
       LLVM_DEBUG({
         llvm::dbgs() << "Testing user:\n";
-        user->dump();
+        user->print(llvm::dbgs(), *asmState);
         for (auto membershipOrdinal : userInfo.membership.set_bits()) {
           llvm::dbgs() << "  member of wave " << membershipOrdinal << "\n";
         }
@@ -375,7 +398,7 @@
     waveSet.partitions.push_back(std::move(wave));
   }
 
-  LLVM_DEBUG(waveSet.dump(block->getParentOp()));
+  LLVM_DEBUG(waveSet.dump(*asmState));
 
   return waveSet;
 }