Rename aggressive fusion to fuse-multi-use (#12585)

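The "aggressive fusion" name was misleading: the flag is forwarded to
getFusableUse as its fuseMultiUse parameter and gates fusion of ops with
multiple uses. Rename the pass option `aggressive-fusion` to
`fuse-multi-use`, the global flag `--iree-flow-enable-aggressive-fusion`
to `--iree-flow-fuse-multi-use`, and the matching C++ identifiers
(`aggressiveFusion` -> `fuseMultiUse`), updating tests, BUILD, and
CMake files accordingly.

For example, the collapse_linalg_generic_on_tensors.mlir test now
invokes the pass as:

  iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-flow-form-dispatch-regions{fuse-multi-use=true}, iree-flow-collapse-dimensions))" %s
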
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/FormDispatchRegions.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/FormDispatchRegions.cpp
index c6561c1..d667bf7 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/FormDispatchRegions.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/FormDispatchRegions.cpp
@@ -476,7 +476,7 @@
 static void fuseRootsWithConsumers(MLIRContext *context,
                                    ArrayRef<Operation *> roots,
                                    DominanceInfo const &dominanceInfo,
-                                   bool aggressiveFusion) {
+                                   bool fuseMultiUse) {
   // Fuse with consumers where possible.
   for (Operation *root : roots) {
     SmallVector<Operation *> workList;
@@ -496,8 +496,8 @@
         appendToFusionGroup(currRoot, rootNumber);
       };
 
-      Optional<OpOperand *> fusableUse = getFusableUse(
-          currRoot, dominanceInfo, /*fuseMultiUse=*/aggressiveFusion);
+      Optional<OpOperand *> fusableUse =
+          getFusableUse(currRoot, dominanceInfo, /*fuseMultiUse=*/fuseMultiUse);
       if (!fusableUse) continue;
 
       // Analyse the use to see if it is fusable.
@@ -553,7 +553,7 @@
 static void fuseRootsWithProducers(MLIRContext *context, Operation *root,
                                    unsigned groupNum,
                                    DominanceInfo const &dominanceInfo,
-                                   bool aggressiveFusion) {
+                                   bool fuseMultiUse) {
   SmallVector<Operation *> worklist;
   worklist.push_back(root);
   llvm::SmallBitVector rootOuterParallelLoops = getOuterParallelLoops(root);
@@ -567,8 +567,8 @@
         continue;
       }
 
-      Optional<OpOperand *> fusableUse = getFusableUse(
-          producer, dominanceInfo, /*fuseMultiUse=*/aggressiveFusion);
+      Optional<OpOperand *> fusableUse =
+          getFusableUse(producer, dominanceInfo, /*fuseMultiUse=*/fuseMultiUse);
       if (!fusableUse || fusableUse.value()->getOwner() != candidate) continue;
 
       if (!isFusableWithProducer(operand, rootOuterParallelLoops)) {
@@ -591,7 +591,7 @@
 /// enough to capture any heuristic.
 static unsigned decideFusableLinalgOps(FunctionOpInterface funcOp,
                                        DominanceInfo const &dominanceInfo,
-                                       bool aggressiveFusion) {
+                                       bool fuseMultiUse) {
   unsigned numRootOps = 0;
   MLIRContext *context = funcOp->getContext();
   OpBuilder builder(context);
@@ -609,11 +609,11 @@
       setRootAttribute(context, &op, newGroup);
 
       fuseRootsWithProducers(context, &op, newGroup, dominanceInfo,
-                             aggressiveFusion);
+                             fuseMultiUse);
       roots.push_back(&op);
     }
     roots = llvm::to_vector(llvm::reverse(roots));
-    fuseRootsWithConsumers(context, roots, dominanceInfo, aggressiveFusion);
+    fuseRootsWithConsumers(context, roots, dominanceInfo, fuseMultiUse);
   }
 
   // Once all root linalg ops have been tagged, put all remaining generic ops
@@ -636,7 +636,7 @@
       roots.push_back(&op);
     }
     roots = llvm::to_vector(llvm::reverse(roots));
-    fuseRootsWithConsumers(context, roots, dominanceInfo, aggressiveFusion);
+    fuseRootsWithConsumers(context, roots, dominanceInfo, fuseMultiUse);
   }
 
   return numRootOps;
@@ -689,11 +689,11 @@
                                         FunctionOpInterface funcOp,
                                         DominanceInfo const &dominanceInfo,
                                         bool generateWorkloadRegion,
-                                        bool aggressiveFusion) {
+                                        bool fuseMultiUse) {
   // Step 1: Decide fusion groups (heuristic). This marks rootOps with an
   // attribute
   unsigned numRoots =
-      decideFusableLinalgOps(funcOp, dominanceInfo, aggressiveFusion);
+      decideFusableLinalgOps(funcOp, dominanceInfo, fuseMultiUse);
   SmallVector<Operation *> roots(numRoots, nullptr);
   DenseMap<unsigned, SmallVector<Operation *>> producers;
 
@@ -779,12 +779,12 @@
         .insert<AffineDialect, IREE::Flow::FlowDialect, linalg::LinalgDialect,
                 scf::SCFDialect, tensor::TensorDialect>();
   }
-  FormDispatchRegionsPass(bool aggressiveFusion, bool generateWorkloadRegion) {
-    this->aggressiveFusion = aggressiveFusion;
+  FormDispatchRegionsPass(bool fuseMultiUse, bool generateWorkloadRegion) {
+    this->fuseMultiUse = fuseMultiUse;
     this->generateWorkloadRegion = generateWorkloadRegion;
   }
   FormDispatchRegionsPass(const FormDispatchRegionsPass &pass)
-      : FormDispatchRegionsPass(pass.aggressiveFusion,
+      : FormDispatchRegionsPass(pass.fuseMultiUse,
                                 pass.generateWorkloadRegion) {}
   void runOnOperation() override;
 };
@@ -796,16 +796,15 @@
   DominanceInfo const &dominanceInfo = getAnalysis<DominanceInfo>();
   TensorDimTrackingRewriter rewriter(funcOp);
   if (failed(createFusionGroups(rewriter, funcOp, dominanceInfo,
-                                generateWorkloadRegion, aggressiveFusion))) {
+                                generateWorkloadRegion, fuseMultiUse))) {
     funcOp->emitOpError("failed to create fusion groups");
     return signalPassFailure();
   }
 }
 
 std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createFormDispatchRegionsPass(bool aggressiveFusion,
-                              bool generateWorkloadRegion) {
-  return std::make_unique<FormDispatchRegionsPass>(aggressiveFusion,
+createFormDispatchRegionsPass(bool fuseMultiUse, bool generateWorkloadRegion) {
+  return std::make_unique<FormDispatchRegionsPass>(fuseMultiUse,
                                                    generateWorkloadRegion);
 }
 }  // namespace Flow
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
index 3a51129..989b789 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
@@ -66,11 +66,8 @@
     llvm::cl::desc("Enable fusing tensor.pad ops into Linalg consumer ops"),
     llvm::cl::init(false));
 
-static llvm::cl::opt<bool> clEnableAggressiveFusion(
-    "iree-flow-enable-aggressive-fusion",
-    llvm::cl::desc(
-        "Enable the aggressive fusion heuristic to fuse multiuse ops and ops "
-        "with reduction loops"),
+static llvm::cl::opt<bool> clEnableFuseMultiUse(
+    "iree-flow-fuse-multi-use", llvm::cl::desc("Fuse multi-use ops"),
     llvm::cl::init(false));
 
 static llvm::cl::opt<bool> clDispatchGenerateWorkloadRegion(
@@ -215,9 +212,8 @@
       .addPass(mlir::createCanonicalizerPass)
       .addPass(mlir::createCSEPass)
       // Elementwise fusion.
-      .addPass([]() {
-        return createFusionOfTensorOpsPass(clEnableAggressiveFusion);
-      })
+      .addPass(
+          []() { return createFusionOfTensorOpsPass(clEnableFuseMultiUse); })
       .addPass(mlir::createLinalgDetensorizePass)
       .addPass(mlir::createCanonicalizerPass)
       .addPass(mlir::createCSEPass)
@@ -247,7 +243,7 @@
       // transformations afterwards with a simple region and without bothering
       // producers.
       .addPass([&]() {
-        return createFormDispatchRegionsPass(clEnableAggressiveFusion,
+        return createFormDispatchRegionsPass(clEnableFuseMultiUse,
                                              clDispatchGenerateWorkloadRegion);
       })
       // Collapse dimensions of linalg Ops.
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h
index ed5c1b2..3df95b0 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h
@@ -140,7 +140,7 @@
 // is created for each tiled loop nest. This pass only moves the root compute op
 // into the dispatch region, allowing producers to be outside.
 std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createFormDispatchRegionsPass(bool aggressiveFusion = false,
+createFormDispatchRegionsPass(bool fuseMultiUse = false,
                               bool generateWorkloadRegion = true);
 
 // Pass to collapse dimensions of Linalg Ops on tensor ops.
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td
index b2ab84c..6f85596 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td
@@ -67,8 +67,8 @@
   let summary = "Form Dispatch Region Ops from Linalg operations on tensors to form dispatch.regions";
   let constructor = "mlir::iree_compiler::IREE::Flow::createFormDispatchRegionsPass()";
   let options = [
-    Option<"aggressiveFusion", "aggressive-fusion", "bool",
-           /*default=*/"false", "Fuse with aggressive heuristics">,
+    Option<"fuseMultiUse", "fuse-multi-use", "bool",
+           /*default=*/"false", "Fuse multi-use ops">,
     Option<"generateWorkloadRegion", "genereate-workload-region", "bool",
            /*default=*/"true", "Generate workload regions of WorkgroupOps">,
   ];
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_linalg_generic_on_tensors.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_linalg_generic_on_tensors.mlir
index 9737783..21b6fe9 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_linalg_generic_on_tensors.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_linalg_generic_on_tensors.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-flow-form-dispatch-regions{aggressive-fusion=true}, iree-flow-collapse-dimensions))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-flow-form-dispatch-regions{fuse-multi-use=true}, iree-flow-collapse-dimensions))" %s | FileCheck %s
 !type = tensor<2x4x8x16x32x64xf32>
 util.global private @"__transpose_10_input" {noinline} = dense<1.0> : !type
 
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir
index 121f288..fa729f4 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --verify-diagnostics --pass-pipeline="builtin.module(func.func(iree-flow-form-dispatch-regions{aggressive-fusion=true}, iree-flow-clone-producers-into-dispatch-regions, iree-flow-form-dispatch-workgroups), cse, canonicalize, cse)" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --verify-diagnostics --pass-pipeline="builtin.module(func.func(iree-flow-form-dispatch-regions{fuse-multi-use=true}, iree-flow-clone-producers-into-dispatch-regions, iree-flow-form-dispatch-workgroups), cse, canonicalize, cse)" %s | FileCheck %s
 func.func @tile_matmul_alone(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>,
              %arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> {
   %1 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion_with_transpose.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion_with_transpose.mlir
index 66d7c75..cc7cc7c 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion_with_transpose.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion_with_transpose.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --verify-diagnostics --pass-pipeline="builtin.module(func.func(iree-flow-interchange-transpose-generic-ops,iree-flow-form-dispatch-regions{aggressive-fusion=true}, iree-flow-form-dispatch-workgroups, canonicalize, cse))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --verify-diagnostics --pass-pipeline="builtin.module(func.func(iree-flow-interchange-transpose-generic-ops,iree-flow-form-dispatch-regions{fuse-multi-use=true}, iree-flow-form-dispatch-workgroups, canonicalize, cse))" %s | FileCheck %s
 
 func.func @fuse_batch_matmul_transpose(%a: tensor<4x384x384xf32>, %b: tensor<4x384x32xf32>) -> tensor<384x4x32xf32> {
   %cst = arith.constant 0.000000e+00 : f32
diff --git a/tests/e2e/regression/BUILD b/tests/e2e/regression/BUILD
index 857dad8..9730796 100644
--- a/tests/e2e/regression/BUILD
+++ b/tests/e2e/regression/BUILD
@@ -128,7 +128,7 @@
         "softmax_large.mlir",
     ],
     compiler_flags = [
-        "--iree-flow-enable-aggressive-fusion",
+        "--iree-flow-fuse-multi-use",
     ],
     driver = "cuda",
     tags = [
@@ -158,7 +158,7 @@
         "softmax.mlir",
     ],
     compiler_flags = [
-        "--iree-flow-enable-aggressive-fusion",
+        "--iree-flow-fuse-multi-use",
     ],
     driver = "local-task",
     target_backend = "llvm-cpu",
diff --git a/tests/e2e/regression/CMakeLists.txt b/tests/e2e/regression/CMakeLists.txt
index 6842933..6c7ea4d 100644
--- a/tests/e2e/regression/CMakeLists.txt
+++ b/tests/e2e/regression/CMakeLists.txt
@@ -172,7 +172,7 @@
   DRIVER
     "cuda"
   COMPILER_FLAGS
-    "--iree-flow-enable-aggressive-fusion"
+    "--iree-flow-fuse-multi-use"
   LABELS
     "noasan"
     "nomsan"
@@ -204,7 +204,7 @@
   DRIVER
     "local-task"
   COMPILER_FLAGS
-    "--iree-flow-enable-aggressive-fusion"
+    "--iree-flow-fuse-multi-use"
 )
 
 iree_check_single_backend_test_suite(
diff --git a/tests/transform_dialect/cuda/BUILD b/tests/transform_dialect/cuda/BUILD
index 0236360..1ba42e0 100644
--- a/tests/transform_dialect/cuda/BUILD
+++ b/tests/transform_dialect/cuda/BUILD
@@ -53,7 +53,7 @@
         #
         # FIXME: This must be used with the custom dispatch region formation
         # because IREE's does not fuse the 6 ops softmax version even with
-        # --iree-flow-enable-aggressive-fusion.
+        # --iree-flow-fuse-multi-use.
         #
         "softmax_dispatch_spec.mlir",
         # First few ops of softmax only, acts as a proxy example.
diff --git a/tests/transform_dialect/cuda/softmax.mlir b/tests/transform_dialect/cuda/softmax.mlir
index 9d96f48..7826c11 100644
--- a/tests/transform_dialect/cuda/softmax.mlir
+++ b/tests/transform_dialect/cuda/softmax.mlir
@@ -4,7 +4,7 @@
 // RUN:     --iree-flow-transformation-pipeline  \
 /// This must be used with the custom dispatch region formation
 /// because IREE's does not fuse the 6 ops softmax version even with
-/// --iree-flow-enable-aggressive-fusion.
+/// --iree-flow-fuse-multi-use.
 // RUN:     --iree-flow-dispatch-use-transform-dialect=%p/softmax_dispatch_spec.mlir \
 // RUN:     --iree-stream-transformation-pipeline \
 // RUN:     --iree-hal-configuration-pipeline | \
@@ -18,7 +18,7 @@
 /// flags leak to the JIT session, which doesn't know what to do with them.
 /// This must be used with the custom dispatch region formation
 /// because IREE's does not fuse the 6 ops softmax version even with
-/// --iree-flow-enable-aggressive-fusion.
+/// --iree-flow-fuse-multi-use.
 // RUN:     --iree-flow-dispatch-use-transform-dialect=%p/softmax_dispatch_spec.mlir \
 // RUN:     --iree-codegen-llvmgpu-use-transform-dialect=%p/softmax_codegen_spec.mlir | \
 // RUN: iree-run-module --function=softmax --device=cuda | \
diff --git a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_codegen_spec.mlir
index 64eda2a..8e8d40c 100644
--- a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/softmax_codegen_spec.mlir
@@ -17,7 +17,7 @@
   // Step 1. First level of tiling + fusion parallelizes to blocks.
   // ==============================================================
   // This must be used with the custom dispatch region formation because IREE's
-  // does not fuse even with --iree-flow-enable-aggressive-fusion.
+  // does not fuse even with --iree-flow-fuse-multi-use.
   // %forall, %_ =
   // transform.iree.tile_to_forall_and_workgroup_count_region %div tile_sizes [1, 4]
   //   ( mapping = [#gpu.thread<x>, #gpu.thread<y>] )
diff --git a/tests/transform_dialect/cuda/softmax_dispatch_spec.mlir b/tests/transform_dialect/cuda/softmax_dispatch_spec.mlir
index 3084926..804ce2d 100644
--- a/tests/transform_dialect/cuda/softmax_dispatch_spec.mlir
+++ b/tests/transform_dialect/cuda/softmax_dispatch_spec.mlir
@@ -13,7 +13,7 @@
 
   /// This must be used with the custom dispatch region formation
   /// because IREE's does not fuse the 6 ops softmax version even with
-  /// --iree-flow-enable-aggressive-fusion.
+  /// --iree-flow-fuse-multi-use.
   %region_op = transform.iree.wrap_in_dispatch_region %div { generateWorkload = false }
 
   %non_div = transform.merge_handles %input_max_fill, %input_max, %exps_sum_fill, %exps, %exps_sum
diff --git a/tests/transform_dialect/cuda/softmax_v2.mlir b/tests/transform_dialect/cuda/softmax_v2.mlir
index d72bde6..76ebb0d 100644
--- a/tests/transform_dialect/cuda/softmax_v2.mlir
+++ b/tests/transform_dialect/cuda/softmax_v2.mlir
@@ -1,7 +1,7 @@
 // RUN: iree-opt %s --iree-hal-target-backends=cuda \
 // RUN:     --iree-abi-transformation-pipeline \
 // RUN:     --iree-flow-transformation-pipeline  \
-// RUN:     --iree-flow-enable-aggressive-fusion \
+// RUN:     --iree-flow-fuse-multi-use \
 // RUN:     --iree-stream-transformation-pipeline \
 // RUN:     --iree-hal-configuration-pipeline | \
 // RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-lower-executable-target)))' \
@@ -13,7 +13,7 @@
 // RUN:     --iree-opt-const-expr-hoisting=false --iree-opt-const-eval=false \
 /// Constant JIT'ing must be disabled because the transform-dialect debug
 /// flags leak to the JIT session, which doesn't know what to do with them.
-// RUN:     --iree-flow-enable-aggressive-fusion \
+// RUN:     --iree-flow-fuse-multi-use \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
 // RUN:     --iree-codegen-llvmgpu-use-transform-dialect=%p/softmax_v2_codegen_spec.mlir | \
 // RUN: iree-run-module --function=softmax --device=cuda | \