Rename aggressive fusion to fuse-multi-use (#12585)
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/FormDispatchRegions.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/FormDispatchRegions.cpp index c6561c1..d667bf7 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/FormDispatchRegions.cpp +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/FormDispatchRegions.cpp
@@ -476,7 +476,7 @@ static void fuseRootsWithConsumers(MLIRContext *context, ArrayRef<Operation *> roots, DominanceInfo const &dominanceInfo, - bool aggressiveFusion) { + bool fuseMultiUse) { // Fuse with consumers where possible. for (Operation *root : roots) { SmallVector<Operation *> workList; @@ -496,8 +496,8 @@ appendToFusionGroup(currRoot, rootNumber); }; - Optional<OpOperand *> fusableUse = getFusableUse( - currRoot, dominanceInfo, /*fuseMultiUse=*/aggressiveFusion); + Optional<OpOperand *> fusableUse = + getFusableUse(currRoot, dominanceInfo, /*fuseMultiUse=*/fuseMultiUse); if (!fusableUse) continue; // Analyse the use to see if it is fusable. @@ -553,7 +553,7 @@ static void fuseRootsWithProducers(MLIRContext *context, Operation *root, unsigned groupNum, DominanceInfo const &dominanceInfo, - bool aggressiveFusion) { + bool fuseMultiUse) { SmallVector<Operation *> worklist; worklist.push_back(root); llvm::SmallBitVector rootOuterParallelLoops = getOuterParallelLoops(root); @@ -567,8 +567,8 @@ continue; } - Optional<OpOperand *> fusableUse = getFusableUse( - producer, dominanceInfo, /*fuseMultiUse=*/aggressiveFusion); + Optional<OpOperand *> fusableUse = + getFusableUse(producer, dominanceInfo, /*fuseMultiUse=*/fuseMultiUse); if (!fusableUse || fusableUse.value()->getOwner() != candidate) continue; if (!isFusableWithProducer(operand, rootOuterParallelLoops)) { @@ -591,7 +591,7 @@ /// enough to capture any heuristic. 
static unsigned decideFusableLinalgOps(FunctionOpInterface funcOp, DominanceInfo const &dominanceInfo, - bool aggressiveFusion) { + bool fuseMultiUse) { unsigned numRootOps = 0; MLIRContext *context = funcOp->getContext(); OpBuilder builder(context); @@ -609,11 +609,11 @@ setRootAttribute(context, &op, newGroup); fuseRootsWithProducers(context, &op, newGroup, dominanceInfo, - aggressiveFusion); + fuseMultiUse); roots.push_back(&op); } roots = llvm::to_vector(llvm::reverse(roots)); - fuseRootsWithConsumers(context, roots, dominanceInfo, aggressiveFusion); + fuseRootsWithConsumers(context, roots, dominanceInfo, fuseMultiUse); } // Once all root linalg ops have been tagged, put all remaining generic ops @@ -636,7 +636,7 @@ roots.push_back(&op); } roots = llvm::to_vector(llvm::reverse(roots)); - fuseRootsWithConsumers(context, roots, dominanceInfo, aggressiveFusion); + fuseRootsWithConsumers(context, roots, dominanceInfo, fuseMultiUse); } return numRootOps; @@ -689,11 +689,11 @@ FunctionOpInterface funcOp, DominanceInfo const &dominanceInfo, bool generateWorkloadRegion, - bool aggressiveFusion) { + bool fuseMultiUse) { // Step 1: Decide fusion groups (heuristic). 
This marks rootOps with an // attribute unsigned numRoots = - decideFusableLinalgOps(funcOp, dominanceInfo, aggressiveFusion); + decideFusableLinalgOps(funcOp, dominanceInfo, fuseMultiUse); SmallVector<Operation *> roots(numRoots, nullptr); DenseMap<unsigned, SmallVector<Operation *>> producers; @@ -779,12 +779,12 @@ .insert<AffineDialect, IREE::Flow::FlowDialect, linalg::LinalgDialect, scf::SCFDialect, tensor::TensorDialect>(); } - FormDispatchRegionsPass(bool aggressiveFusion, bool generateWorkloadRegion) { - this->aggressiveFusion = aggressiveFusion; + FormDispatchRegionsPass(bool fuseMultiUse, bool generateWorkloadRegion) { + this->fuseMultiUse = fuseMultiUse; this->generateWorkloadRegion = generateWorkloadRegion; } FormDispatchRegionsPass(const FormDispatchRegionsPass &pass) - : FormDispatchRegionsPass(pass.aggressiveFusion, + : FormDispatchRegionsPass(pass.fuseMultiUse, pass.generateWorkloadRegion) {} void runOnOperation() override; }; @@ -796,16 +796,15 @@ DominanceInfo const &dominanceInfo = getAnalysis<DominanceInfo>(); TensorDimTrackingRewriter rewriter(funcOp); if (failed(createFusionGroups(rewriter, funcOp, dominanceInfo, - generateWorkloadRegion, aggressiveFusion))) { + generateWorkloadRegion, fuseMultiUse))) { funcOp->emitOpError("failed to create fusion groups"); return signalPassFailure(); } } std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>> -createFormDispatchRegionsPass(bool aggressiveFusion, - bool generateWorkloadRegion) { - return std::make_unique<FormDispatchRegionsPass>(aggressiveFusion, +createFormDispatchRegionsPass(bool fuseMultiUse, bool generateWorkloadRegion) { + return std::make_unique<FormDispatchRegionsPass>(fuseMultiUse, generateWorkloadRegion); } } // namespace Flow
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp index 3a51129..989b789 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
@@ -66,11 +66,8 @@ llvm::cl::desc("Enable fusing tensor.pad ops into Linalg consumer ops"), llvm::cl::init(false)); -static llvm::cl::opt<bool> clEnableAggressiveFusion( - "iree-flow-enable-aggressive-fusion", - llvm::cl::desc( - "Enable the aggressive fusion heuristic to fuse multiuse ops and ops " - "with reduction loops"), +static llvm::cl::opt<bool> clEnableFuseMultiUse( + "iree-flow-fuse-multi-use", llvm::cl::desc("Fuse multi-use ops"), llvm::cl::init(false)); static llvm::cl::opt<bool> clDispatchGenerateWorkloadRegion( @@ -215,9 +212,8 @@ .addPass(mlir::createCanonicalizerPass) .addPass(mlir::createCSEPass) // Elementwise fusion. - .addPass([]() { - return createFusionOfTensorOpsPass(clEnableAggressiveFusion); - }) + .addPass( + []() { return createFusionOfTensorOpsPass(clEnableFuseMultiUse); }) .addPass(mlir::createLinalgDetensorizePass) .addPass(mlir::createCanonicalizerPass) .addPass(mlir::createCSEPass) @@ -247,7 +243,7 @@ // transformations afterwards with a simple region and without bothering // producers. .addPass([&]() { - return createFormDispatchRegionsPass(clEnableAggressiveFusion, + return createFormDispatchRegionsPass(clEnableFuseMultiUse, clDispatchGenerateWorkloadRegion); }) // Collapse dimensions of linalg Ops.
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h index ed5c1b2..3df95b0 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h
@@ -140,7 +140,7 @@ // is created for each tiled loop nest. This pass only moves the root compute op // into the dispatch region, allowing producers to be outside. std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>> -createFormDispatchRegionsPass(bool aggressiveFusion = false, +createFormDispatchRegionsPass(bool fuseMultiUse = false, bool generateWorkloadRegion = true); // Pass to collapse dimensions of Linalg Ops on tensor ops.
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td index b2ab84c..6f85596 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td
@@ -67,8 +67,8 @@ let summary = "Form Dispatch Region Ops from Linalg operations on tensors to form dispatch.regions"; let constructor = "mlir::iree_compiler::IREE::Flow::createFormDispatchRegionsPass()"; let options = [ - Option<"aggressiveFusion", "aggressive-fusion", "bool", - /*default=*/"false", "Fuse with aggressive heuristics">, + Option<"fuseMultiUse", "fuse-multi-use", "bool", + /*default=*/"false", "Fuse multi-use ops">, Option<"generateWorkloadRegion", "genereate-workload-region", "bool", /*default=*/"true", "Generate workload regions of WorkgroupOps">, ];
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_linalg_generic_on_tensors.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_linalg_generic_on_tensors.mlir index 9737783..21b6fe9 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_linalg_generic_on_tensors.mlir +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_linalg_generic_on_tensors.mlir
@@ -1,4 +1,4 @@ -// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-flow-form-dispatch-regions{aggressive-fusion=true}, iree-flow-collapse-dimensions))" %s | FileCheck %s +// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-flow-form-dispatch-regions{fuse-multi-use=true}, iree-flow-collapse-dimensions))" %s | FileCheck %s !type = tensor<2x4x8x16x32x64xf32> util.global private @"__transpose_10_input" {noinline} = dense<1.0> : !type
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir index 121f288..fa729f4 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir
@@ -1,4 +1,4 @@ -// RUN: iree-opt --split-input-file --verify-diagnostics --pass-pipeline="builtin.module(func.func(iree-flow-form-dispatch-regions{aggressive-fusion=true}, iree-flow-clone-producers-into-dispatch-regions, iree-flow-form-dispatch-workgroups), cse, canonicalize, cse)" %s | FileCheck %s +// RUN: iree-opt --split-input-file --verify-diagnostics --pass-pipeline="builtin.module(func.func(iree-flow-form-dispatch-regions{fuse-multi-use=true}, iree-flow-clone-producers-into-dispatch-regions, iree-flow-form-dispatch-workgroups), cse, canonicalize, cse)" %s | FileCheck %s func.func @tile_matmul_alone(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> { %1 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion_with_transpose.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion_with_transpose.mlir index 66d7c75..cc7cc7c 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion_with_transpose.mlir +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion_with_transpose.mlir
@@ -1,4 +1,4 @@ -// RUN: iree-opt --split-input-file --verify-diagnostics --pass-pipeline="builtin.module(func.func(iree-flow-interchange-transpose-generic-ops,iree-flow-form-dispatch-regions{aggressive-fusion=true}, iree-flow-form-dispatch-workgroups, canonicalize, cse))" %s | FileCheck %s +// RUN: iree-opt --split-input-file --verify-diagnostics --pass-pipeline="builtin.module(func.func(iree-flow-interchange-transpose-generic-ops,iree-flow-form-dispatch-regions{fuse-multi-use=true}, iree-flow-form-dispatch-workgroups, canonicalize, cse))" %s | FileCheck %s func.func @fuse_batch_matmul_transpose(%a: tensor<4x384x384xf32>, %b: tensor<4x384x32xf32>) -> tensor<384x4x32xf32> { %cst = arith.constant 0.000000e+00 : f32
diff --git a/tests/e2e/regression/BUILD b/tests/e2e/regression/BUILD index 857dad8..9730796 100644 --- a/tests/e2e/regression/BUILD +++ b/tests/e2e/regression/BUILD
@@ -128,7 +128,7 @@ "softmax_large.mlir", ], compiler_flags = [ - "--iree-flow-enable-aggressive-fusion", + "--iree-flow-fuse-multi-use", ], driver = "cuda", tags = [ @@ -158,7 +158,7 @@ "softmax.mlir", ], compiler_flags = [ - "--iree-flow-enable-aggressive-fusion", + "--iree-flow-fuse-multi-use", ], driver = "local-task", target_backend = "llvm-cpu",
diff --git a/tests/e2e/regression/CMakeLists.txt b/tests/e2e/regression/CMakeLists.txt index 6842933..6c7ea4d 100644 --- a/tests/e2e/regression/CMakeLists.txt +++ b/tests/e2e/regression/CMakeLists.txt
@@ -172,7 +172,7 @@ DRIVER "cuda" COMPILER_FLAGS - "--iree-flow-enable-aggressive-fusion" + "--iree-flow-fuse-multi-use" LABELS "noasan" "nomsan" @@ -204,7 +204,7 @@ DRIVER "local-task" COMPILER_FLAGS - "--iree-flow-enable-aggressive-fusion" + "--iree-flow-fuse-multi-use" ) iree_check_single_backend_test_suite(
diff --git a/tests/transform_dialect/cuda/BUILD b/tests/transform_dialect/cuda/BUILD index 0236360..1ba42e0 100644 --- a/tests/transform_dialect/cuda/BUILD +++ b/tests/transform_dialect/cuda/BUILD
@@ -53,7 +53,7 @@ # # FIXME: This must be used with the custom dispatch region formation # because IREE's does not fuse the 6 ops softmax version even with - # --iree-flow-enable-aggressive-fusion. + # --iree-flow-fuse-multi-use. # "softmax_dispatch_spec.mlir", # First few ops of softmax only, acts as a proxy example.
diff --git a/tests/transform_dialect/cuda/softmax.mlir b/tests/transform_dialect/cuda/softmax.mlir index 9d96f48..7826c11 100644 --- a/tests/transform_dialect/cuda/softmax.mlir +++ b/tests/transform_dialect/cuda/softmax.mlir
@@ -4,7 +4,7 @@ // RUN: --iree-flow-transformation-pipeline \ /// This must be used with the custom dispatch region formation /// because IREE's does not fuse the 6 ops softmax version even with -/// --iree-flow-enable-aggressive-fusion. +/// --iree-flow-fuse-multi-use. // RUN: --iree-flow-dispatch-use-transform-dialect=%p/softmax_dispatch_spec.mlir \ // RUN: --iree-stream-transformation-pipeline \ // RUN: --iree-hal-configuration-pipeline | \ @@ -18,7 +18,7 @@ /// flags leak to the JIT session, which doesn't know what to do with them. /// This must be used with the custom dispatch region formation /// because IREE's does not fuse the 6 ops softmax version even with -/// --iree-flow-enable-aggressive-fusion. +/// --iree-flow-fuse-multi-use. // RUN: --iree-flow-dispatch-use-transform-dialect=%p/softmax_dispatch_spec.mlir \ // RUN: --iree-codegen-llvmgpu-use-transform-dialect=%p/softmax_codegen_spec.mlir | \ // RUN: iree-run-module --function=softmax --device=cuda | \
diff --git a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_codegen_spec.mlir index 64eda2a..8e8d40c 100644 --- a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir +++ b/tests/transform_dialect/cuda/softmax_codegen_spec.mlir
@@ -17,7 +17,7 @@ // Step 1. First level of tiling + fusion parallelizes to blocks. // ============================================================== // This must be used with the custom dispatch region formation because IREE's - // does not fuse even with --iree-flow-enable-aggressive-fusion. + // does not fuse even with --iree-flow-fuse-multi-use. // %forall, %_ = // transform.iree.tile_to_forall_and_workgroup_count_region %div tile_sizes [1, 4] // ( mapping = [#gpu.thread<x>, #gpu.thread<y>] )
diff --git a/tests/transform_dialect/cuda/softmax_dispatch_spec.mlir b/tests/transform_dialect/cuda/softmax_dispatch_spec.mlir index 3084926..804ce2d 100644 --- a/tests/transform_dialect/cuda/softmax_dispatch_spec.mlir +++ b/tests/transform_dialect/cuda/softmax_dispatch_spec.mlir
@@ -13,7 +13,7 @@ /// This must be used with the custom dispatch region formation /// because IREE's does not fuse the 6 ops softmax version even with - /// --iree-flow-enable-aggressive-fusion. + /// --iree-flow-fuse-multi-use. %region_op = transform.iree.wrap_in_dispatch_region %div { generateWorkload = false } %non_div = transform.merge_handles %input_max_fill, %input_max, %exps_sum_fill, %exps, %exps_sum
diff --git a/tests/transform_dialect/cuda/softmax_v2.mlir b/tests/transform_dialect/cuda/softmax_v2.mlir index d72bde6..76ebb0d 100644 --- a/tests/transform_dialect/cuda/softmax_v2.mlir +++ b/tests/transform_dialect/cuda/softmax_v2.mlir
@@ -1,7 +1,7 @@ // RUN: iree-opt %s --iree-hal-target-backends=cuda \ // RUN: --iree-abi-transformation-pipeline \ // RUN: --iree-flow-transformation-pipeline \ -// RUN: --iree-flow-enable-aggressive-fusion \ +// RUN: --iree-flow-fuse-multi-use \ // RUN: --iree-stream-transformation-pipeline \ // RUN: --iree-hal-configuration-pipeline | \ // RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-lower-executable-target)))' \ @@ -13,7 +13,7 @@ // RUN: --iree-opt-const-expr-hoisting=false --iree-opt-const-eval=false \ /// Constant JIT'ing must be disabled because the transform-dialect debug /// flags leak to the JIT session, which doesn't know what to do with them. -// RUN: --iree-flow-enable-aggressive-fusion \ +// RUN: --iree-flow-fuse-multi-use \ // RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ // RUN: --iree-codegen-llvmgpu-use-transform-dialect=%p/softmax_v2_codegen_spec.mlir | \ // RUN: iree-run-module --function=softmax --device=cuda | \