Update the usage of the transform dialect interpreter (#15340)
Upstream is shifting to using the simpler form of the transform dialect
interpreter introduced in
https://github.com/llvm/llvm-project/pull/68661
Update IREE's codegen usage of the interpreter as well as tests which
now require a named sequence entry point.
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
index 7acdd4f..53d8c78 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
@@ -6,8 +6,8 @@
# Tests for common transforms.
-load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
package(
features = ["layering_check"],
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_workgroup_swizzle.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_workgroup_swizzle.mlir
index fbed641..1982183 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_workgroup_swizzle.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_workgroup_swizzle.mlir
@@ -28,11 +28,15 @@
return
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.iree.workgroup_swizzle %0 { log_tile = 3 } : (!transform.any_op) -> ()
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(
+ %variant_op: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ transform.iree.workgroup_swizzle %0 { log_tile = 3 } : (!transform.any_op) -> ()
+ transform.yield
+ }
+} // module
+
// CHECK-LABEL: func.func @matmul
// CHECK: %[[WORKGROUPIDX:.*]] = hal.interface.workgroup.id[0] : index
diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp
index 918b43a..88e9a15 100644
--- a/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp
@@ -4,29 +4,35 @@
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include <iterator>
#include "iree/compiler/Codegen/Common/PassDetail.h"
#include "iree/compiler/Codegen/Common/Passes.h"
#include "iree/compiler/Codegen/Common/UserConfig.h"
#include "iree/compiler/Codegen/Dialect/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Dialect/IREECodegenDialect.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/iterator_range.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/Transform/IR/TransformDialect.h"
#include "mlir/Dialect/Transform/IR/TransformOps.h"
#include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#define DEBUG_TYPE "iree-codegen-materialize-library-calls"
+#define DEBUG_TYPE "iree-codegen-materialize-user-configs"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
namespace mlir {
namespace iree_compiler {
-llvm::cl::opt<std::string> clCodegenTransformDialectTestName(
+llvm::cl::opt<std::string> clCodegenTransformDialectStrategyName(
"iree-codegen-use-transform-dialect-strategy",
llvm::cl::desc(
- "Broadcasts the given transform dialect strategy specification to all"
- "dispatches. Supports two modes; a path to the MLIR file containing a"
- "transform dialect specification to apply, and a symbol reference to"
- "load from a library of transform specs (@library_call)"),
+ "Broadcasts the given transform dialect strategy specification to all "
+        "dispatches. The specification is a symbol reference to load from a "
+ "library of transform specs (@library_call)"),
llvm::cl::init(""));
llvm::cl::opt<std::string> clCodegenTransformDialectLibraryFileName(
@@ -40,25 +46,6 @@
static const char kTranslationInfoAttrName[] = "translation_info";
-static void createEmptyTransformStrategy(ModuleOp innerModule) {
- Location loc = innerModule.getLoc();
- OpBuilder b = OpBuilder::atBlockEnd(innerModule.getBody());
- auto topLevelTransformModule = b.create<ModuleOp>(loc);
- Region &topLevelTransformRegion = topLevelTransformModule.getBodyRegion();
- b.setInsertionPointToStart(&topLevelTransformRegion.front());
- auto anyOpType = transform::AnyOpType::get(b.getContext());
-
- // Create the include for the named sequence with the expectation that the
- // external definition will be linked in later.
- auto sequence = b.create<transform::SequenceOp>(
- loc, TypeRange{}, transform::FailurePropagationMode::Propagate, anyOpType,
- [&](OpBuilder &b, Location loc, Value variantH) {
- b.create<transform::PrintOp>(loc, variantH);
- b.create<transform::YieldOp>(loc);
- });
- (void)sequence;
-}
-
struct MaterializeUserConfigsPass
: public MaterializeUserConfigsBase<MaterializeUserConfigsPass> {
void getDependentDialects(DialectRegistry ®istry) const override {
@@ -72,6 +59,7 @@
getAllEntryPoints(moduleOp);
MLIRContext *context = moduleOp.getContext();
+ LDBG("MaterializeUserConfigsPass on variant: " << variantOp);
std::optional<ModuleOp> transformLibrary = std::nullopt;
if (!clCodegenTransformDialectLibraryFileName.empty()) {
auto dialect =
@@ -79,9 +67,13 @@
auto maybeTransformLibrary = dialect->getOrLoadTransformLibraryModule(
clCodegenTransformDialectLibraryFileName);
if (failed(maybeTransformLibrary)) {
+ variantOp.emitError() << "failed to load transform library module: "
+ << clCodegenTransformDialectLibraryFileName;
return signalPassFailure();
}
transformLibrary = *maybeTransformLibrary;
+ LDBG("--found transform library @"
+ << clCodegenTransformDialectLibraryFileName);
}
IREE::Codegen::DispatchLoweringPassPipeline tdPipeline =
@@ -89,19 +81,27 @@
std::optional<IREE::Codegen::TranslationInfoAttr> clTranslationInfo;
// Here we always set the pipeline strategy to transform dialect if the
// flag is non-empty to ensure we pick the right lowering pipeline in the
- // event a file path is given.
- if (!clCodegenTransformDialectTestName.empty()) {
+ // event a strategy symbol is defined.
+ if (!clCodegenTransformDialectLibraryFileName.empty() ||
+ !clCodegenTransformDialectStrategyName.empty()) {
+ StringRef strategyName =
+ (clCodegenTransformDialectStrategyName.empty())
+ ? StringRef(
+ transform::TransformDialect::kTransformEntryPointSymbolName)
+ : clCodegenTransformDialectStrategyName;
clTranslationInfo = IREE::Codegen::TranslationInfoAttr::get(
context, tdPipeline,
/*softwarePipelineDepth=*/0,
/*softwarePipelineStoreStage=*/1,
- /*codegenSpec=*/clCodegenTransformDialectTestName[0] == '@'
- ? SymbolRefAttr::get(
- context, llvm::StringRef(
- clCodegenTransformDialectTestName.substr(1)))
- : SymbolRefAttr());
+ /*codegenSpec=*/
+ SymbolRefAttr::get(context, llvm::StringRef(strategyName)));
+ LDBG("--clTranslationInfo: " << clTranslationInfo);
}
+ LDBG("--start iterating over: "
+ << std::distance(moduleOp.getOps<func::FuncOp>().begin(),
+ moduleOp.getOps<func::FuncOp>().end())
+ << " functions");
std::optional<IREE::Codegen::TranslationInfoAttr> translationInfo;
for (auto funcOp : moduleOp.getOps<func::FuncOp>()) {
auto exportOp = exportOps.lookup(funcOp.getName());
@@ -131,7 +131,7 @@
/// Currently codegen is rooted on the variant, meaning every entry
/// must go through the same codegen pipeline. For multi-targeting we
/// will want to have multiple functions per variant, as well as
- /// multple exports per variant, meaning eventually the nesting of
+ /// multiple exports per variant, meaning eventually the nesting of
/// the translation pipeline will need to change to the function, or
/// we'll need another level of module op nesting.
if (exportedTranslationInfo != translationInfo.value()) {
@@ -160,6 +160,7 @@
}
}
+ LDBG("--guaranteed unique translationInfo: " << translationInfo);
/// We only need to resolve symbols for transform dialect based strategies.
if (!translationInfo ||
translationInfo.value().getDispatchLoweringPassPipeline() !=
@@ -167,52 +168,38 @@
return;
}
- std::optional<SymbolRefAttr> libraryFunc =
+ // From now on, we know we have a transform dialect strategy. We now need to
+ // ensure it can resolve and apply in a subsequent interpreter pass or else
+ // we need to fall back to codegen.
+ bool failedToResolve = false;
+ auto g = llvm::make_scope_exit([&]() {
+ if (!failedToResolve)
+ return;
+
+ exportOps = getAllEntryPoints(variantOp.getInnerModule());
+ for (auto &it : exportOps) {
+ auto exportOp = it.second;
+ if (getTranslationInfo(exportOp) == translationInfo) {
+ exportOp->removeAttr(kTranslationInfoAttrName);
+ }
+ }
+ });
+
+ std::optional<SymbolRefAttr> strategyName =
translationInfo.value().getCodegenSpec();
- if (!libraryFunc || *libraryFunc == SymbolRefAttr()) {
+ if (!strategyName || *strategyName == SymbolRefAttr()) {
+ failedToResolve = true;
return;
}
/// If we have a symbol, verify the existence of the symbol within the
/// transform library.
+ StringRef entryPoint = strategyName->getLeafReference();
if (!transformLibrary || !(*transformLibrary) ||
!transform::detail::findTransformEntryPoint(
- variantOp, *transformLibrary, libraryFunc->getLeafReference())) {
+ variantOp, *transformLibrary, entryPoint)) {
moduleOp.emitOpError("failed to find transform strategy symbol");
- return signalPassFailure();
- }
-
- // TODO: At this point we could allow the user to (optionally) return a
- // translation info attribute to use, however there currently isn't a way
- // upstream to retrieve the results of the named sequence.
-
- /// Attempt to execute the strategy. symbol (from the flag or otherwise) at
- /// the same time. Because the strategy is rooted on the variant op, the
- /// strategy can change the translation info on the exports if needed, else
- /// back to default IREE codegen.
- StringRef entryPoint = libraryFunc->getLeafReference();
- Operation *transformRoot = transform::detail::findTransformEntryPoint(
- variantOp, *transformLibrary, entryPoint);
- if (!transformRoot) {
- return;
- }
- if (failed(transform::applyTransformNamedSequence(
- variantOp, transformRoot, *transformLibrary,
- options.enableExpensiveChecks(true)))) {
- return signalPassFailure();
- }
-
- // Re-retrieve the export ops and mark all exports with unchanged
- // translation info as un-translated.
- // TODO: Currently this is the only way to "fall back" to codegen. If the
- // user wants to do all of codegen themselves they can set a `None`
- // pipeline.
- exportOps = getAllEntryPoints(variantOp.getInnerModule());
- for (auto &it : exportOps) {
- auto exportOp = it.second;
- if (getTranslationInfo(exportOp) == translationInfo) {
- exportOp->removeAttr(kTranslationInfoAttrName);
- }
+ failedToResolve = true;
}
}
diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.h b/compiler/src/iree/compiler/Codegen/Common/Passes.h
index ec5ab2c..0db187d 100644
--- a/compiler/src/iree/compiler/Codegen/Common/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/Common/Passes.h
@@ -252,10 +252,7 @@
/// Create an IREE-specific Transform dialect interpreter pass with all
/// registrations necessary for IREE.
-std::unique_ptr<Pass> createTransformDialectInterpreterPass(
- llvm::StringRef transformFileName = llvm::StringRef(),
- llvm::StringRef debugPayloadRootTag = llvm::StringRef(),
- llvm::StringRef debugTransformRootTag = llvm::StringRef());
+std::unique_ptr<Pass> createTransformDialectInterpreterPass();
/// Pass to propagate type to avoid generating load/stores of illegal types.
std::unique_ptr<OperationPass<func::FuncOp>> createTypePropagationPass();
diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.td b/compiler/src/iree/compiler/Codegen/Common/Passes.td
index 0aefa61..4ca6907 100644
--- a/compiler/src/iree/compiler/Codegen/Common/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/Common/Passes.td
@@ -453,35 +453,18 @@
let summary = "Pass to apply transform dialect operations.";
let constructor =
"mlir::iree_compiler::createTransformDialectInterpreterPass()";
+ let description = [{
+ This pass runs the transform dialect interpreter and applies the named
+ sequence transformation specified by the provided name (defaults to
+ `TransformDialect::kTransformEntryPointSymbolName` (i.e. `__transform_main`)).
+ }];
let options = [
- Option<"transformFileName", "transform-file-name", "std::string",
- /*default=*/"\"\"",
- "Optional filename containing a transform dialect specification to "
- "apply. If left empty, the IR is assumed to contain one top-level "
- "transform dialect operation somewhere in the module.">,
- ListOption<"transformLibraryPaths",
- "transform-library-paths",
- "std::string",
- "If non-empty, the paths to files containing definitions of "
- "external symbols referenced in the transform script. "
- "These definitions will be used to replace declarations.">,
- Option<"debugPayloadRootTag", "debug-payload-root-tag", "std::string",
- /*default=*/"\"\"",
- "Select the operation with 'transform.target_tag' attribute having "
- "the given value as payload IR root. This allows user control on "
- "what operation to transform in debug mode, without requiring "
- "intimate knowledge of the IREE nested pass pipeline.\\n"
- "If empty (normal operation mode), select the pass anchor "
- "operation in the IREE pipeline, as the payload IR root.">,
- Option<"debugTransformRootTag", "debug-transform-root-tag", "std::string",
- /*default=*/"\"\"",
- "Select the operation with 'transform.target_tag' attribute having "
- "the given value as container IR for top-level transform ops. This "
- "allows user control on what transformation to apply in debug "
- "mode, without requiring intimate knowledge of the IREE nested "
- "pass pipeline.\\n"
- "If empty (normal operation mode), select the container of the "
- "top-level transform op.">
+ Option<"entryPoint", "entry-point", "std::string",
+           /*default=*/[{transform::TransformDialect::kTransformEntryPointSymbolName.str()}],
+ "Entry point of the pass pipeline.">,
+ Option<"libraryFileName", "library-file-name", "std::string",
+ /*default=*/[{""}],
+ "File path to load a library of transform dialect strategies from.">,
];
}
diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp b/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp
index 1b95c7d..237dc20 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp
@@ -6,6 +6,7 @@
#include "iree/compiler/Codegen/Common/PassDetail.h"
#include "iree/compiler/Codegen/Common/Passes.h"
+#include "iree/compiler/Codegen/Dialect/IREECodegenDialect.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
#include "mlir/Dialect/Transform/Transforms/TransformInterpreterPassBase.h"
#include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h"
@@ -20,62 +21,66 @@
/// This needs to be its own pass because the registration mechanism and ops
/// available are different than for other interpreters.
class TransformDialectInterpreterPass
- : public mlir::transform::TransformInterpreterPassBase<
- TransformDialectInterpreterPass,
- iree_compiler::TransformDialectInterpreterBase> {
+ : public iree_compiler::TransformDialectInterpreterBase<
+ TransformDialectInterpreterPass> {
public:
+ TransformDialectInterpreterPass(StringRef libraryFileName = StringRef(),
+ StringRef entryPoint = StringRef()) {
+ this->libraryFileName = libraryFileName.str();
+ this->entryPoint = entryPoint.str();
+ }
+
void getDependentDialects(DialectRegistry ®istry) const override {
mlir::iree_compiler::registerTransformDialectTranslationDependentDialects(
registry);
}
- // We don't register libraries here because we expect them to be pre-loaded
- // much earlier on in the compiler pipeline.
- TransformDialectInterpreterPass(
- StringRef transformFileName = StringRef(),
- StringRef debugPayloadRootTag = StringRef(),
- StringRef debugTransformRootTag = StringRef()) {
- this->transformFileName = transformFileName.str();
- this->debugPayloadRootTag = debugPayloadRootTag.str();
- this->debugTransformRootTag = debugTransformRootTag.str();
+ void runOnOperation() override {
+ MLIRContext *context = &getContext();
+ transform::TransformOptions options;
+ if (entryPoint.empty()) {
+ entryPoint =
+ transform::TransformDialect::kTransformEntryPointSymbolName.str();
+ }
+ auto dialect = context->getOrLoadDialect<
+ mlir::iree_compiler::IREE::Codegen::IREECodegenDialect>();
+ FailureOr<ModuleOp> maybeTransformLibrary;
+ if (!libraryFileName.empty()) {
+ maybeTransformLibrary =
+ dialect->getOrLoadTransformLibraryModule(libraryFileName);
+ }
+
+ Operation *payloadRoot = getOperation();
+ ModuleOp transformModule =
+ succeeded(maybeTransformLibrary) ? *maybeTransformLibrary : ModuleOp();
+ Operation *transformEntryPoint = transform::detail::findTransformEntryPoint(
+ getOperation(), transformModule, entryPoint);
+ if (!transformEntryPoint) {
+ Operation *transformModuleOrPayloadRoot =
+ transformModule ? transformModule : payloadRoot;
+ transformModuleOrPayloadRoot->emitError()
+ << "failed to find transform entry point '" << entryPoint << "'";
+ return signalPassFailure();
+ }
+ if (failed(transform::applyTransformNamedSequence(
+ payloadRoot, transformEntryPoint, transformModule,
+ options.enableExpensiveChecks(true))))
+ return signalPassFailure();
}
- TransformDialectInterpreterPass(const TransformDialectInterpreterPass &pass) =
- default;
};
} // namespace
namespace mlir {
namespace iree_compiler {
-extern llvm::cl::opt<std::string> clCodegenTransformDialectTestName;
-static llvm::cl::opt<std::string> clCodegenTransformDialectDebugPayloadTag(
- "iree-codegen-transform-dialect-debug-payload-tag",
- llvm::cl::desc("tag attribute value for the transform dialect interpreter "
- "payload root operation"),
- llvm::cl::init(""));
-static llvm::cl::opt<std::string> clCodegenTransformDialectDebugTransformTag(
- "iree-codegen-transform-dialect-debug-transform-tag",
- llvm::cl::desc(
- "tag attribute value for the transform dialect transform op container"),
- llvm::cl::init(""));
+extern llvm::cl::opt<std::string> clCodegenTransformDialectStrategyName;
+extern llvm::cl::opt<std::string> clCodegenTransformDialectLibraryFileName;
/// Create a Transform dialect interpreter pass.
-std::unique_ptr<Pass>
-createTransformDialectInterpreterPass(llvm::StringRef transformFileName,
- llvm::StringRef debugPayloadRootTag,
- llvm::StringRef debugTransformRootTag) {
- // If the strategy filename is prefixed with `@`, it refers to a library
- // call.
- std::string clFileName = !clCodegenTransformDialectTestName.empty() &&
- clCodegenTransformDialectTestName[0] != '@'
- ? clCodegenTransformDialectTestName
- : std::string();
+std::unique_ptr<Pass> createTransformDialectInterpreterPass() {
return std::make_unique<TransformDialectInterpreterPass>(
- transformFileName.empty() ? clFileName : transformFileName,
- debugPayloadRootTag.empty() ? clCodegenTransformDialectDebugPayloadTag
- : debugPayloadRootTag,
- debugTransformRootTag.empty() ? clCodegenTransformDialectDebugTransformTag
- : debugTransformRootTag);
+ clCodegenTransformDialectLibraryFileName,
+ clCodegenTransformDialectStrategyName);
}
} // namespace iree_compiler
} // namespace mlir
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel
index 82e5a1c..431e851 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel
@@ -60,7 +60,6 @@
"test_partitionable_loops_interface.mlir",
"tile_and_distribute_to_workgroups.mlir",
"transform_buffer_opt.mlir",
- "transform_dialect_apply_pattern_op.mlir",
"transform_match_partial_reduction.mlir",
"transform_ops_invalid.mlir",
"transpose_canonicalization.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt
index 8b2e512..3ef3060 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt
@@ -56,7 +56,6 @@
"test_partitionable_loops_interface.mlir"
"tile_and_distribute_to_workgroups.mlir"
"transform_buffer_opt.mlir"
- "transform_dialect_apply_pattern_op.mlir"
"transform_match_partial_reduction.mlir"
"transform_ops_invalid.mlir"
"transpose_canonicalization.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/batch_matmul_match_spec.mlir b/compiler/src/iree/compiler/Codegen/Common/test/batch_matmul_match_spec.mlir
index 302aabd..815e4f7 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/batch_matmul_match_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/batch_matmul_match_spec.mlir
@@ -1,9 +1,11 @@
// RUN: iree-opt %s
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
- transform.iree.register_match_callbacks
- %0:2 = transform.iree.match_callback failures(propagate) "batch_matmul"(%arg0) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.iree.emit_remark "fill" at %0#0 : !transform.any_op
- transform.iree.emit_remark "batch matmul" at %0#1 : !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+ transform.iree.register_match_callbacks
+ %0:2 = transform.iree.match_callback failures(propagate) "batch_matmul"(%root) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.iree.emit_remark "fill" at %0#0 : !transform.any_op
+ transform.iree.emit_remark "batch matmul" at %0#1 : !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/batch_matmuls.mlir b/compiler/src/iree/compiler/Codegen/Common/test/batch_matmuls.mlir
index bfeb9a6..2d91350 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/batch_matmuls.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/batch_matmuls.mlir
@@ -1,4 +1,7 @@
-// RUN: iree-opt %s --iree-transform-dialect-interpreter='transform-file-name=%p/batch_matmul_match_spec.mlir' --split-input-file --verify-diagnostics
+// RUN: iree-opt %s \
+// RUN: --iree-codegen-transform-dialect-library=%p/batch_matmul_match_spec.mlir \
+// RUN: --iree-transform-dialect-interpreter \
+// RUN: --split-input-file --verify-diagnostics
!lhs = tensor<128x80x32xf32>
!rhs = tensor<128x32x320xf32>
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/convolution_match_spec.mlir b/compiler/src/iree/compiler/Codegen/Common/test/convolution_match_spec.mlir
index 52ea94b..898b091 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/convolution_match_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/convolution_match_spec.mlir
@@ -1,14 +1,16 @@
// RUN: iree-opt %s
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
- transform.iree.register_match_callbacks
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+ transform.iree.register_match_callbacks
- %fill, %convolution, %trailing =
- transform.iree.match_callback failures(propagate) "convolution"(%arg0)
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+ %fill, %convolution, %trailing =
+ transform.iree.match_callback failures(propagate) "convolution"(%root)
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
- transform.iree.emit_remark "fill" at %fill : !transform.any_op
- transform.iree.emit_remark "convolution" at %convolution : !transform.any_op
- transform.iree.emit_remark "trailing" at %trailing : !transform.any_op
-}
+ transform.iree.emit_remark "fill" at %fill : !transform.any_op
+ transform.iree.emit_remark "convolution" at %convolution : !transform.any_op
+ transform.iree.emit_remark "trailing" at %trailing : !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/convolutions.mlir b/compiler/src/iree/compiler/Codegen/Common/test/convolutions.mlir
index 5a724b2..be4bb2f 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/convolutions.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/convolutions.mlir
@@ -1,4 +1,8 @@
-// RUN: iree-opt %s --iree-transform-dialect-interpreter='transform-file-name=%p/convolution_match_spec.mlir' --split-input-file --verify-diagnostics
+// RUN: iree-opt %s \
+// RUN: --iree-codegen-transform-dialect-library=%p/convolution_match_spec.mlir \
+// RUN: --iree-transform-dialect-interpreter \
+// RUN: --split-input-file --verify-diagnostics
+
!input_tensor_t = tensor<2x16x130x130xf32>
!weight_tensor_t = tensor<32x16x3x3xf32>
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/reductions.mlir b/compiler/src/iree/compiler/Codegen/Common/test/reductions.mlir
index 289bb02..3459164 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/reductions.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/reductions.mlir
@@ -1,5 +1,5 @@
-// RUN: iree-opt %s --iree-transform-dialect-interpreter='transform-file-name=%p/reductions_codegen_spec.mlir' --split-input-file | FileCheck %s
-// RUN: iree-opt %s --iree-transform-dialect-interpreter='transform-file-name=%p/reductions_match_spec.mlir' --split-input-file --verify-diagnostics
+// RUN: iree-opt %s --iree-codegen-transform-dialect-library=%p/reductions_codegen_spec.mlir --iree-transform-dialect-interpreter --split-input-file | FileCheck %s
+// RUN: iree-opt %s --iree-codegen-transform-dialect-library=%p/reductions_match_spec.mlir --iree-transform-dialect-interpreter --split-input-file --verify-diagnostics
// Check that the same transform script applies to reductions with optional
// leading and trailing elementwise operations, potentially reordered
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/reductions_codegen_spec.mlir b/compiler/src/iree/compiler/Codegen/Common/test/reductions_codegen_spec.mlir
index 93e5e39..d2a67aa 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/reductions_codegen_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/reductions_codegen_spec.mlir
@@ -1,75 +1,77 @@
// RUN: iree-opt %s
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
- transform.iree.register_match_callbacks
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+ transform.iree.register_match_callbacks
- %maybe_leading, %original_fill, %reduction, %maybe_trailing_0 =
- transform.iree.match_callback failures(propagate) "reduction"(%arg0)
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-
- %_, %more_parallel_fill, %parallel_reduction, %combiner_op =
- transform.structured.split_reduction %reduction { split_factor = 2, insert_split_dimension = 1 }
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+ %maybe_leading, %original_fill, %reduction, %maybe_trailing_0 =
+ transform.iree.match_callback failures(propagate) "reduction"(%root)
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+
+ %_, %more_parallel_fill, %parallel_reduction, %combiner_op =
+ transform.structured.split_reduction %reduction { split_factor = 2, insert_split_dimension = 1 }
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
- // Step 1. Map to a single block by tiling with size 1 and fusing.
- %fusion_root_1, %fusion_group_1 = transform.iree.take_first %maybe_trailing_0, %combiner_op
- : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- %outer_tiled, %grid_loop = transform.structured.tile_using_forall %fusion_root_1 tile_sizes [1]
- ( mapping = [#gpu.block<x>] )
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
- %func = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func {
- transform.apply_patterns.iree.bubble_expand
- } : !transform.any_op
+ // Step 1. Map to a single block by tiling with size 1 and fusing.
+ %fusion_root_1, %fusion_group_1 = transform.iree.take_first %maybe_trailing_0, %combiner_op
+ : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %outer_tiled, %grid_loop = transform.structured.tile_using_forall %fusion_root_1 tile_sizes [1]
+ ( mapping = [#gpu.block<x>] )
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func {
+ transform.apply_patterns.iree.bubble_expand
+ } : !transform.any_op
- // Excessively eager canonicalization results in `fill`s being "fused" due to
- // swapping with `extract_slice`, which confuses the fusion operation below.
- // Wrap fusion into a non-canonicalized sequence.
- %fused_2, %parallel_reduction_2, %more_parallel_fill_2, %original_fill_2, %maybe_leading_2 =
- transform.sequence %arg0 : !transform.any_op -> !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op
- failures(propagate) {
- ^bb1(%arg1: !transform.any_op):
- %fused_22, %new_containing_1 = transform.structured.fuse_into_containing_op %fusion_group_1 into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- %parallel_reduction_22, %new_containing_2 = transform.structured.fuse_into_containing_op %parallel_reduction into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- %more_parallel_fill_22, %new_containing_3 = transform.structured.fuse_into_containing_op %more_parallel_fill into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- %original_fill_22, %new_containing_4 = transform.structured.fuse_into_containing_op %original_fill into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- %maybe_leading_22, %new_containing_5 = transform.structured.fuse_into_containing_op %maybe_leading into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ // Excessively eager canonicalization results in `fill`s being "fused" due to
+ // swapping with `extract_slice`, which confuses the fusion operation below.
+ // Wrap fusion into a non-canonicalized sequence.
+ %fused_2, %parallel_reduction_2, %more_parallel_fill_2, %original_fill_2, %maybe_leading_2 =
+ transform.sequence %root : !transform.any_op -> !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op
+ failures(propagate) {
+ ^bb1(%arg1: !transform.any_op):
+ %fused_22, %new_containing_1 = transform.structured.fuse_into_containing_op %fusion_group_1 into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %parallel_reduction_22, %new_containing_2 = transform.structured.fuse_into_containing_op %parallel_reduction into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %more_parallel_fill_22, %new_containing_3 = transform.structured.fuse_into_containing_op %more_parallel_fill into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %original_fill_22, %new_containing_4 = transform.structured.fuse_into_containing_op %original_fill into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %maybe_leading_22, %new_containing_5 = transform.structured.fuse_into_containing_op %maybe_leading into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.yield %fused_22, %parallel_reduction_22, %more_parallel_fill_22, %original_fill_22, %maybe_leading_22
- : !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op
- }
+ transform.yield %fused_22, %parallel_reduction_22, %more_parallel_fill_22, %original_fill_22, %maybe_leading_22
+ : !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op
+ }
- // Step 2. Map reduction to thread X and parallel dimension to other threads.
- // ===========================================================================
- %fusion_group_22_full = transform.merge_handles %fused_2, %original_fill_2
- : !transform.any_op
- %fusion_root_22_tiled, %block_loop_22 =
- transform.structured.tile_using_forall %outer_tiled
- tile_sizes [1] ( mapping = [#gpu.thread<z>] )
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.structured.fuse_into_containing_op %fusion_group_22_full into %block_loop_22 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
+ // Step 2. Map reduction to thread X and parallel dimension to other threads.
+ // ===========================================================================
+ %fusion_group_22_full = transform.merge_handles %fused_2, %original_fill_2
+ : !transform.any_op
+ %fusion_root_22_tiled, %block_loop_22 =
+ transform.structured.tile_using_forall %outer_tiled
+ tile_sizes [1] ( mapping = [#gpu.thread<z>] )
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.structured.fuse_into_containing_op %fusion_group_22_full into %block_loop_22 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
- %fusion_group_21 = transform.merge_handles %maybe_leading_2, %more_parallel_fill_2
- : !transform.any_op
- %fusion_root_21_tiled, %block_loop_21 =
- transform.structured.tile_using_forall %parallel_reduction_2
- tile_sizes [1, 1] ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.structured.fuse_into_containing_op %fusion_group_21 into %block_loop_21 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
- // Step 3. Rank-reduce.
- // ===========================================================================
- transform.apply_patterns to %func {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
+ %fusion_group_21 = transform.merge_handles %maybe_leading_2, %more_parallel_fill_2
+ : !transform.any_op
+ %fusion_root_21_tiled, %block_loop_21 =
+ transform.structured.tile_using_forall %parallel_reduction_2
+ tile_sizes [1, 1] ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.structured.fuse_into_containing_op %fusion_group_21 into %block_loop_21 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // Step 3. Rank-reduce.
+ // ===========================================================================
+ transform.apply_patterns to %func {
+ transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+ transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+ transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+ } : !transform.any_op
- // We don't perform any following transformation (vectorization, bufferizaton,
- // mapping) because this schedule is applied to Linalg-only code without the
- // surrounding context and because it would make it difficult to detect, e.g.,
- // lack of fusion.
-}
+    // We don't perform any following transformation (vectorization, bufferization,
+ // mapping) because this schedule is applied to Linalg-only code without the
+ // surrounding context and because it would make it difficult to detect, e.g.,
+ // lack of fusion.
+ transform.yield
+ } // @__transform_main
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/reductions_match_spec.mlir b/compiler/src/iree/compiler/Codegen/Common/test/reductions_match_spec.mlir
index 3de0a24..7f19631 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/reductions_match_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/reductions_match_spec.mlir
@@ -1,15 +1,17 @@
// RUN: iree-opt %s
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
- transform.iree.register_match_callbacks
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+ transform.iree.register_match_callbacks
- %leading, %fill, %reduction, %trailing =
- transform.iree.match_callback failures(propagate) "reduction"(%arg0)
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+ %leading, %fill, %reduction, %trailing =
+ transform.iree.match_callback failures(propagate) "reduction"(%root)
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
- transform.iree.emit_remark "leading" at %leading : !transform.any_op
- transform.iree.emit_remark "fill" at %fill : !transform.any_op
- transform.iree.emit_remark "reduction" at %reduction : !transform.any_op
- transform.iree.emit_remark "trailing" at %trailing : !transform.any_op
-}
+ transform.iree.emit_remark "leading" at %leading : !transform.any_op
+ transform.iree.emit_remark "fill" at %fill : !transform.any_op
+ transform.iree.emit_remark "reduction" at %reduction : !transform.any_op
+ transform.iree.emit_remark "trailing" at %trailing : !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/repeated_matcher_use.mlir b/compiler/src/iree/compiler/Codegen/Common/test/repeated_matcher_use.mlir
index 783ad3d..3193304 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/repeated_matcher_use.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/repeated_matcher_use.mlir
@@ -1,239 +1,241 @@
-// RUN: iree-opt %s --iree-transform-dialect-interpreter --verify-diagnostics --split-input-file
+// RUN: iree-opt %s \
+// RUN: --iree-transform-dialect-interpreter \
+// RUN: --split-input-file --verify-diagnostics
-module {
- transform.sequence failures(propagate) {
- ^bb0(%arg0: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
transform.iree.register_match_callbacks
%first, %second =
- transform.iree.match_callback failures(propagate) "_test_repeated_matcher_use_callback"(%arg0)
+ transform.iree.match_callback failures(propagate) "_test_repeated_matcher_use_callback"(%root)
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.iree.emit_remark "first" at %first : !transform.any_op
transform.iree.emit_remark "second" at %second : !transform.any_op
- }
+ transform.yield
+ } // @__transform_main
+} // module
- module {
- func.func private @f1(f32) -> f32
- func.func private @f2(f32, f32) -> f32
+module {
+ func.func private @f1(f32) -> f32
+ func.func private @f2(f32, f32) -> f32
- func.func @foo() -> tensor<10xf32> {
- %dummy1 = tensor.empty() : tensor<10xf32>
- %dummy2 = tensor.empty() : tensor<10xf32>
- %dummy3 = tensor.empty() : tensor<10xf32>
- %c0 = arith.constant 0.0 : f32
- %operand = linalg.fill ins(%c0 : f32) outs(%dummy1 : tensor<10xf32>) -> tensor<10xf32>
-
- // expected-remark @below {{first}}
- %first = linalg.generic {
- indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
- iterator_types = ["parallel"]
- } ins(%operand : tensor<10xf32>)
- outs(%dummy2 : tensor<10xf32>) {
- ^bb0(%arg0: f32, %arg1: f32):
- %0 = func.call @f1(%arg0) : (f32) -> f32
- linalg.yield %0 : f32
- } -> tensor<10xf32>
+ func.func @foo() -> tensor<10xf32> {
+ %dummy1 = tensor.empty() : tensor<10xf32>
+ %dummy2 = tensor.empty() : tensor<10xf32>
+ %dummy3 = tensor.empty() : tensor<10xf32>
+ %c0 = arith.constant 0.0 : f32
+ %operand = linalg.fill ins(%c0 : f32) outs(%dummy1 : tensor<10xf32>) -> tensor<10xf32>
+
+ // expected-remark @below {{first}}
+ %first = linalg.generic {
+ indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+ iterator_types = ["parallel"]
+ } ins(%operand : tensor<10xf32>)
+ outs(%dummy2 : tensor<10xf32>) {
+ ^bb0(%arg0: f32, %arg1: f32):
+ %0 = func.call @f1(%arg0) : (f32) -> f32
+ linalg.yield %0 : f32
+ } -> tensor<10xf32>
- // expected-remark @below {{second}}
- %second = linalg.generic {
- indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
- iterator_types = ["parallel"]
- } ins(%operand, %first : tensor<10xf32>, tensor<10xf32>)
- outs(%dummy3 : tensor<10xf32>) {
- ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
- %0 = func.call @f2(%arg0, %arg1) : (f32, f32) -> f32
- linalg.yield %0 : f32
- } -> tensor<10xf32>
- return %second : tensor<10xf32>
- }
+ // expected-remark @below {{second}}
+ %second = linalg.generic {
+ indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+ iterator_types = ["parallel"]
+ } ins(%operand, %first : tensor<10xf32>, tensor<10xf32>)
+ outs(%dummy3 : tensor<10xf32>) {
+ ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
+ %0 = func.call @f2(%arg0, %arg1) : (f32, f32) -> f32
+ linalg.yield %0 : f32
+ } -> tensor<10xf32>
+ return %second : tensor<10xf32>
}
}
// -----
-module {
- transform.sequence failures(propagate) {
- ^bb0(%arg0: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
transform.iree.register_match_callbacks
// expected-error @+2 {{failed to match}}
%first, %second =
- transform.iree.match_callback failures(propagate) "_test_repeated_matcher_use_callback"(%arg0)
+ transform.iree.match_callback failures(propagate) "_test_repeated_matcher_use_callback"(%root)
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.iree.emit_remark "first" at %first : !transform.any_op
transform.iree.emit_remark "second" at %second : !transform.any_op
- }
+ transform.yield
+ } // @__transform_main
+} // module
- module {
- func.func private @f1(f32) -> f32
- func.func private @f2(f32, f32) -> f32
+module {
+ func.func private @f1(f32) -> f32
+ func.func private @f2(f32, f32) -> f32
- func.func @foo() -> tensor<10xf32> {
- %dummy1 = tensor.empty() : tensor<10xf32>
- %dummy2 = tensor.empty() : tensor<10xf32>
- %dummy3 = tensor.empty() : tensor<10xf32>
- %dummy5 = tensor.empty() : tensor<10xf32>
- %c0 = arith.constant 0.0 : f32
- %c5 = arith.constant 5.0 : f32
- %operand5 = linalg.fill ins(%c5 : f32) outs(%dummy5 : tensor<10xf32>) -> tensor<10xf32>
- %operand = linalg.fill ins(%c0 : f32) outs(%dummy1 : tensor<10xf32>) -> tensor<10xf32>
-
- %first = linalg.generic {
- indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
- iterator_types = ["parallel"]
- } ins(%operand : tensor<10xf32>)
- outs(%dummy2 : tensor<10xf32>) {
- ^bb0(%arg0: f32, %arg1: f32):
- %0 = func.call @f1(%arg0) : (f32) -> f32
- linalg.yield %0 : f32
- } -> tensor<10xf32>
+ func.func @foo() -> tensor<10xf32> {
+ %dummy1 = tensor.empty() : tensor<10xf32>
+ %dummy2 = tensor.empty() : tensor<10xf32>
+ %dummy3 = tensor.empty() : tensor<10xf32>
+ %dummy5 = tensor.empty() : tensor<10xf32>
+ %c0 = arith.constant 0.0 : f32
+ %c5 = arith.constant 5.0 : f32
+ %operand5 = linalg.fill ins(%c5 : f32) outs(%dummy5 : tensor<10xf32>) -> tensor<10xf32>
+ %operand = linalg.fill ins(%c0 : f32) outs(%dummy1 : tensor<10xf32>) -> tensor<10xf32>
+
+ %first = linalg.generic {
+ indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+ iterator_types = ["parallel"]
+ } ins(%operand : tensor<10xf32>)
+ outs(%dummy2 : tensor<10xf32>) {
+ ^bb0(%arg0: f32, %arg1: f32):
+ %0 = func.call @f1(%arg0) : (f32) -> f32
+ linalg.yield %0 : f32
+ } -> tensor<10xf32>
- %second = linalg.generic {
- indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
- iterator_types = ["parallel"]
- } ins(%operand5, %first : tensor<10xf32>, tensor<10xf32>)
- outs(%dummy3 : tensor<10xf32>) {
- ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
- %0 = func.call @f2(%arg0, %arg1) : (f32, f32) -> f32
- linalg.yield %0 : f32
- } -> tensor<10xf32>
- return %second : tensor<10xf32>
- }
+ %second = linalg.generic {
+ indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+ iterator_types = ["parallel"]
+ } ins(%operand5, %first : tensor<10xf32>, tensor<10xf32>)
+ outs(%dummy3 : tensor<10xf32>) {
+ ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
+ %0 = func.call @f2(%arg0, %arg1) : (f32, f32) -> f32
+ linalg.yield %0 : f32
+ } -> tensor<10xf32>
+ return %second : tensor<10xf32>
}
}
// -----
-module {
- transform.sequence failures(propagate) {
- ^bb0(%arg0: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
transform.iree.register_match_callbacks
// expected-error @+2 {{failed to match}}
%first, %second =
- transform.iree.match_callback failures(propagate) "_test_repeated_matcher_use_callback"(%arg0)
+ transform.iree.match_callback failures(propagate) "_test_repeated_matcher_use_callback"(%root)
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.iree.emit_remark "first" at %first : !transform.any_op
transform.iree.emit_remark "second" at %second : !transform.any_op
- }
+ transform.yield
+ } // @__transform_main
+} // module
- module {
- func.func private @f1(f32) -> f32
- func.func private @f2(f32, f32) -> f32
+module {
+ func.func private @f1(f32) -> f32
+ func.func private @f2(f32, f32) -> f32
- func.func @foo() -> tensor<10xf32> {
- %dummy1 = tensor.empty() : tensor<10xf32>
- %dummy2 = tensor.empty() : tensor<10xf32>
- %dummy3 = tensor.empty() : tensor<10xf32>
- %dummy5 = tensor.empty() : tensor<10xf32>
- %c0 = arith.constant 0.0 : f32
- %operand = linalg.fill ins(%c0 : f32) outs(%dummy1 : tensor<10xf32>) -> tensor<10xf32>
-
- %first = linalg.generic {
- indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
- iterator_types = ["parallel"]
- } ins(%operand : tensor<10xf32>)
- outs(%dummy2 : tensor<10xf32>) {
- ^bb0(%arg0: f32, %arg1: f32):
- %0 = func.call @f1(%arg0) : (f32) -> f32
- linalg.yield %0 : f32
- } -> tensor<10xf32>
+ func.func @foo() -> tensor<10xf32> {
+ %dummy1 = tensor.empty() : tensor<10xf32>
+ %dummy2 = tensor.empty() : tensor<10xf32>
+ %dummy3 = tensor.empty() : tensor<10xf32>
+ %dummy5 = tensor.empty() : tensor<10xf32>
+ %c0 = arith.constant 0.0 : f32
+ %operand = linalg.fill ins(%c0 : f32) outs(%dummy1 : tensor<10xf32>) -> tensor<10xf32>
+
+ %first = linalg.generic {
+ indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+ iterator_types = ["parallel"]
+ } ins(%operand : tensor<10xf32>)
+ outs(%dummy2 : tensor<10xf32>) {
+ ^bb0(%arg0: f32, %arg1: f32):
+ %0 = func.call @f1(%arg0) : (f32) -> f32
+ linalg.yield %0 : f32
+ } -> tensor<10xf32>
- %second = linalg.generic {
- indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
- iterator_types = ["parallel"]
- } ins(%first, %first : tensor<10xf32>, tensor<10xf32>)
- outs(%dummy3 : tensor<10xf32>) {
- ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
- %0 = func.call @f2(%arg0, %arg1) : (f32, f32) -> f32
- linalg.yield %0 : f32
- } -> tensor<10xf32>
- return %second : tensor<10xf32>
- }
+ %second = linalg.generic {
+ indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+ iterator_types = ["parallel"]
+ } ins(%first, %first : tensor<10xf32>, tensor<10xf32>)
+ outs(%dummy3 : tensor<10xf32>) {
+ ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
+ %0 = func.call @f2(%arg0, %arg1) : (f32, f32) -> f32
+ linalg.yield %0 : f32
+ } -> tensor<10xf32>
+ return %second : tensor<10xf32>
}
}
// -----
-module {
- transform.sequence failures(propagate) {
- ^bb0(%arg0: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
transform.iree.register_match_callbacks
%first, %second =
- transform.iree.match_callback failures(propagate) "_test_value_matcher_callback"(%arg0)
+ transform.iree.match_callback failures(propagate) "_test_value_matcher_callback"(%root)
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.iree.emit_remark "first" at %first : !transform.any_op
transform.iree.emit_remark "second" at %second : !transform.any_op
- }
+ transform.yield
+ } // @__transform_main
+} // module
- module {
- func.func private @f1(f32) -> f32
- func.func private @f2(f32, f32) -> f32
+module {
+ func.func private @f1(f32) -> f32
+ func.func private @f2(f32, f32) -> f32
- func.func @foo() -> tensor<10xf32> {
- %dummy1 = tensor.empty() : tensor<10xf32>
- %dummy2 = tensor.empty() : tensor<10xf32>
- %dummy3 = tensor.empty() : tensor<10xf32>
- %operand = tensor.empty() : tensor<10xf32>
+ func.func @foo() -> tensor<10xf32> {
+ %dummy1 = tensor.empty() : tensor<10xf32>
+ %dummy2 = tensor.empty() : tensor<10xf32>
+ %dummy3 = tensor.empty() : tensor<10xf32>
+ %operand = tensor.empty() : tensor<10xf32>
- // expected-remark @below {{first}}
- %first = linalg.generic {
- indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
- iterator_types = ["parallel"]
- } ins(%operand : tensor<10xf32>)
- outs(%dummy2 : tensor<10xf32>) {
- ^bb0(%arg0: f32, %arg1: f32):
- %0 = func.call @f1(%arg0) : (f32) -> f32
- linalg.yield %0 : f32
- } -> tensor<10xf32>
+ // expected-remark @below {{first}}
+ %first = linalg.generic {
+ indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+ iterator_types = ["parallel"]
+ } ins(%operand : tensor<10xf32>)
+ outs(%dummy2 : tensor<10xf32>) {
+ ^bb0(%arg0: f32, %arg1: f32):
+ %0 = func.call @f1(%arg0) : (f32) -> f32
+ linalg.yield %0 : f32
+ } -> tensor<10xf32>
- // expected-remark @below {{second}}
- %second = linalg.generic {
- indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
- iterator_types = ["parallel"]
- } ins(%operand, %first : tensor<10xf32>, tensor<10xf32>)
- outs(%dummy3 : tensor<10xf32>) {
- ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
- %0 = func.call @f2(%arg0, %arg1) : (f32, f32) -> f32
- linalg.yield %0 : f32
- } -> tensor<10xf32>
- return %second : tensor<10xf32>
- }
+ // expected-remark @below {{second}}
+ %second = linalg.generic {
+ indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+ iterator_types = ["parallel"]
+ } ins(%operand, %first : tensor<10xf32>, tensor<10xf32>)
+ outs(%dummy3 : tensor<10xf32>) {
+ ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
+ %0 = func.call @f2(%arg0, %arg1) : (f32, f32) -> f32
+ linalg.yield %0 : f32
+ } -> tensor<10xf32>
+ return %second : tensor<10xf32>
}
}
// -----
-module {
- transform.sequence failures(propagate) {
- ^bb0(%arg0: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
transform.iree.register_match_callbacks
- %0 = transform.iree.match_callback failures(propagate) "_test_shaped_value_matcher_callback"(%arg0)
+ %0 = transform.iree.match_callback failures(propagate) "_test_shaped_value_matcher_callback"(%root)
: (!transform.any_op) -> !transform.any_op
transform.iree.emit_remark "matched" at %0 : !transform.any_op
- }
+ transform.yield
+ } // @__transform_main
+} // module
- module {
- func.func @foo(%arg0: tensor<42x10xf32>) -> tensor<10x42xf32> {
- %init = tensor.empty() : tensor<10x42xf32>
- // expected-remark @below {{rank: 2}}
- // expected-remark @below {{dimensions: 10, 42}}
- // expected-remark @below {{matched}}
- %0 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]
- } ins(%arg0: tensor<42x10xf32>)
- outs(%init: tensor<10x42xf32>) {
- ^bb0(%arg1: f32, %arg2: f32):
- linalg.yield %arg1 : f32
- } -> tensor<10x42xf32>
- return %0 : tensor<10x42xf32>
- }
+module {
+ func.func @foo(%arg0: tensor<42x10xf32>) -> tensor<10x42xf32> {
+ %init = tensor.empty() : tensor<10x42xf32>
+ // expected-remark @below {{rank: 2}}
+ // expected-remark @below {{dimensions: 10, 42}}
+ // expected-remark @below {{matched}}
+ %0 = linalg.generic {
+ indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>],
+ iterator_types = ["parallel", "parallel"]
+ } ins(%arg0: tensor<42x10xf32>)
+ outs(%init: tensor<10x42xf32>) {
+ ^bb0(%arg1: f32, %arg2: f32):
+ linalg.yield %arg1 : f32
+ } -> tensor<10x42xf32>
+ return %0 : tensor<10x42xf32>
}
}
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/transform_buffer_opt.mlir b/compiler/src/iree/compiler/Codegen/Common/test/transform_buffer_opt.mlir
index a4e1fc4..082d851 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/transform_buffer_opt.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/transform_buffer_opt.mlir
@@ -15,8 +15,10 @@
return %r : vector<4xf32>
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.memref.erase_dead_alloc_and_stores %0 : (!transform.any_op) -> ()
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
+ transform.memref.erase_dead_alloc_and_stores %0 : (!transform.any_op) -> ()
+ transform.yield
+ } // @__transform_main
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/transform_dialect_apply_pattern_op.mlir b/compiler/src/iree/compiler/Codegen/Common/test/transform_dialect_apply_pattern_op.mlir
deleted file mode 100644
index 684e863..0000000
--- a/compiler/src/iree/compiler/Codegen/Common/test/transform_dialect_apply_pattern_op.mlir
+++ /dev/null
@@ -1,261 +0,0 @@
-// RUN: iree-opt %s -iree-transform-dialect-interpreter -transform-dialect-drop-schedule --split-input-file | FileCheck %s
-
-// CHECK-LABEL: @select_cmp_eq_select
-// CHECK: return %arg1
-func.func @select_cmp_eq_select(%arg0: i64, %arg1: i64) -> i64 {
- %0 = arith.cmpi eq, %arg0, %arg1 : i64
- %1 = arith.select %0, %arg0, %arg1 : i64
- return %1 : i64
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %0 {
- transform.apply_patterns.canonicalization
- } : !transform.any_op
-}
-
-// -----
-
-#map0 = affine_map<(d0, d1) -> (d0, d1)>
-#map2 = affine_map<(d0) -> (d0 * 4)>
-
-// CHECK-LABEL: @promote
-func.func @promote() -> (tensor<16x128xf32>) {
- %c0 = arith.constant 0 : index
- %f0 = arith.constant 0.000000e+00 : f32
- %c16 = arith.constant 16 : index
- %c32 = arith.constant 32 : index
-
- %empty = tensor.empty() : tensor<16x128xf32>
- %filled = linalg.fill ins(%f0 : f32) outs(%empty : tensor<16x128xf32>) -> tensor<16x128xf32>
-
- // CHECK: forall{{.*}}shared_outs(%[[ARG:.*]] =
- // CHECK: %[[A:.*]] = tensor.extract_slice %[[ARG]]
- // CHECK: %[[B:.*]] = tensor.extract_slice %[[ARG]]
- // CHECK: %[[C:.*]] = linalg.generic{{.*}}ins(%[[A]]{{.*}}outs(%[[B]]
- %10 = scf.forall (%arg0, %arg1) in (%c16, %c32) shared_outs(%arg2 = %filled) -> (tensor<16x128xf32>) {
- %11 = affine.apply #map2(%arg1)
- %extracted_slice = tensor.extract_slice %filled[%arg0, %11] [1, 4] [1, 1] : tensor<16x128xf32> to tensor<1x4xf32>
- %extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %11] [1, 4] [1, 1] : tensor<16x128xf32> to tensor<1x4xf32>
- %13 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice : tensor<1x4xf32>) outs(%extracted_slice_2 : tensor<1x4xf32>) {
- ^bb0(%in: f32, %out: f32):
- %res = arith.addf %in, %in: f32
- linalg.yield %res : f32
- } -> tensor<1x4xf32>
- scf.forall.in_parallel {
- tensor.parallel_insert_slice %13 into %arg2[%arg0, %11] [1, 4] [1, 1] : tensor<1x4xf32> into tensor<16x128xf32>
- }
- }
- return %10 : tensor<16x128xf32>
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["scf.forall"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %1 = transform.cast %0 : !transform.any_op to !transform.op<"scf.forall">
- transform.iree.share_forall_operands %1 share_operands = [0] : (!transform.op<"scf.forall">) -> !transform.op<"scf.forall">
-}
-
-// -----
-
-#map2 = affine_map<(d0, d1) -> (d0, d1)>
-
-func.func private @mutate(f32) -> f32
-
-// CHECK-LABEL: @bubble_up
-func.func @bubble_up(%arg0: tensor<32x64xf32>) -> tensor<32x2x32xf32> {
- // Check that shape expansion precedes linalg.generic after the patterns were applied.
- // CHECK: tensor.expand_shape
- // CHECK: tensor.expand_shape
- // CHECK: linalg.generic
- %init = tensor.empty() : tensor<32x64xf32>
- %result = linalg.generic {
- indexing_maps = [#map2, #map2],
- iterator_types = ["parallel", "parallel"]}
- ins(%arg0: tensor<32x64xf32>) outs(%init: tensor<32x64xf32>) {
- ^bb0(%arg1: f32, %arg2: f32):
- %0 = func.call @mutate(%arg1) : (f32) -> f32
- linalg.yield %0 : f32
- } -> tensor<32x64xf32>
- %out = tensor.expand_shape %result[[0], [1, 2]] : tensor<32x64xf32> into tensor<32x2x32xf32>
- return %out : tensor<32x2x32xf32>
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %0 {
- transform.apply_patterns.iree.bubble_expand
- } : !transform.any_op
-}
-
-// -----
-
-// CHECK-LABEL: @pad_fill_to_fill
-func.func @pad_fill_to_fill(%arg0: tensor<31x62xf32>) -> tensor<32x64xf32> {
- // Check that a pad of a fill with the same constant is replaced by a
- // bigger fill.
- // CHECK-DAG: %[[FILL_CST:.*]] = arith.constant 0.0{{0*e\+00}} : f32
- // CHECK-DAG: %[[EMPTY:.*]] = tensor.empty() : tensor<32x64xf32>
- // CHECK: %[[PADDED_FILL:.*]] = linalg.fill ins(%[[FILL_CST]] : f32) outs(%[[EMPTY]] : tensor<32x64xf32>) -> tensor<32x64xf32>
- // CHECK: return %[[PADDED_FILL]]
- %cst = arith.constant 0.0 : f32
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %c2 = arith.constant 2 : index
- %fill = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<31x62xf32>) -> tensor<31x62xf32>
- %padded = tensor.pad %fill low[%c0, %c0] high[%c1, %c2] {
- ^bb0(%arg3: index, %arg4: index):
- tensor.yield %cst : f32
- } : tensor<31x62xf32> to tensor<32x64xf32>
- return %padded : tensor<32x64xf32>
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %0 {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- } : !transform.any_op
-}
-
-// -----
-
-// CHECK-LABEL: @pad_fill_different_ssa_value_but_same_cst
-func.func @pad_fill_different_ssa_value_but_same_cst(%arg0: tensor<31x62xf32>) -> tensor<32x64xf32> {
- // Check that a pad of a fill with the same constant is replaced by a
- // bigger fill even when the constant comes from different ssa value.
- // CHECK-DAG: %[[FILL_CST:.*]] = arith.constant 0.0{{0*e\+00}} : f32
- // CHECK-DAG: %[[EMPTY:.*]] = tensor.empty() : tensor<32x64xf32>
- // CHECK: %[[PADDED_FILL:.*]] = linalg.fill ins(%[[FILL_CST]] : f32) outs(%[[EMPTY]] : tensor<32x64xf32>) -> tensor<32x64xf32>
- // CHECK: return %[[PADDED_FILL]]
- %cst = arith.constant 0.0 : f32
- %cst2 = arith.constant 0.0 : f32
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %c2 = arith.constant 2 : index
- %fill = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<31x62xf32>) -> tensor<31x62xf32>
- %padded = tensor.pad %fill low[%c0, %c0] high[%c1, %c2] {
- ^bb0(%arg3: index, %arg4: index):
- tensor.yield %cst2 : f32
- } : tensor<31x62xf32> to tensor<32x64xf32>
- return %padded : tensor<32x64xf32>
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %0 {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- } : !transform.any_op
-}
-
-// -----
-
-// CHECK-LABEL: @pad_extract_fill_to_fill
-func.func @pad_extract_fill_to_fill(%arg0: tensor<31x62xf32>,
- %size0 : index, %size1 : index,
- %high0 : index, %high1 : index) -> tensor<32x64xf32> {
- // Check that a pad of a fill with the same constant is replaced by a
- // bigger fill even when the fill is hidden behind an extract_slice.
- // CHECK-DAG: %[[FILL_CST:.*]] = arith.constant 0.0{{0*e\+00}} : f32
- // CHECK-DAG: %[[EMPTY:.*]] = tensor.empty() : tensor<32x64xf32>
- // CHECK: %[[PADDED_FILL:.*]] = linalg.fill ins(%[[FILL_CST]] : f32) outs(%[[EMPTY]] : tensor<32x64xf32>) -> tensor<32x64xf32>
- // CHECK: return %[[PADDED_FILL]]
- %cst = arith.constant 0.0 : f32
- %cst2 = arith.constant 0.0 : f32
- %c0 = arith.constant 0 : index
- %fill = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<31x62xf32>) -> tensor<31x62xf32>
- %extracted_slice = tensor.extract_slice %fill[0, 0] [%size0, %size1] [1, 1] : tensor<31x62xf32> to tensor<?x?xf32>
- %padded = tensor.pad %extracted_slice low[%c0, %c0] high[%high0, %high1] {
- ^bb0(%arg3: index, %arg4: index):
- tensor.yield %cst2 : f32
- } : tensor<?x?xf32> to tensor<32x64xf32>
- return %padded : tensor<32x64xf32>
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %0 {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- } : !transform.any_op
-}
-
-// -----
-
-// CHECK-LABEL: @pad_extract_extract_fill_to_fill
-func.func @pad_extract_extract_fill_to_fill(%arg0: tensor<31x62xf32>,
- %size0a : index, %size1a : index,
- %size0b : index, %size1b : index,
- %high0 : index, %high1 : index) -> tensor<32x64xf32> {
- // Check that a pad of a fill with the same constant is replaced by a
- // bigger fill even when the fill is hidden behind a few `extract_slice`s.
- // CHECK-DAG: %[[FILL_CST:.*]] = arith.constant 0.0{{0*e\+00}} : f32
- // CHECK-DAG: %[[EMPTY:.*]] = tensor.empty() : tensor<32x64xf32>
- // CHECK: %[[PADDED_FILL:.*]] = linalg.fill ins(%[[FILL_CST]] : f32) outs(%[[EMPTY]] : tensor<32x64xf32>) -> tensor<32x64xf32>
- // CHECK: return %[[PADDED_FILL]]
- %cst = arith.constant 0.0 : f32
- %cst2 = arith.constant 0.0 : f32
- %c0 = arith.constant 0 : index
- %fill = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<31x62xf32>) -> tensor<31x62xf32>
- %extracted_sliceA = tensor.extract_slice %fill[0, 0] [%size0a, %size1a] [1, 1] : tensor<31x62xf32> to tensor<?x?xf32>
- %extracted_sliceB = tensor.extract_slice %extracted_sliceA[0, 0] [%size0b, %size1b] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
- %padded = tensor.pad %extracted_sliceB low[%c0, %c0] high[%high0, %high1] {
- ^bb0(%arg3: index, %arg4: index):
- tensor.yield %cst2 : f32
- } : tensor<?x?xf32> to tensor<32x64xf32>
- return %padded : tensor<32x64xf32>
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %0 {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- } : !transform.any_op
-}
-
-// -----
-
-// CHECK-LABEL: @pad_extract_bigger_fill_to_fill
-func.func @pad_extract_bigger_fill_to_fill(%arg0: tensor<253x123xf32>,
- %size0 : index, %size1 : index,
- %high0 : index, %high1 : index) -> tensor<32x64xf32> {
- // Check that a pad of a bigger fill with the same constant is replaced by a
- // fill of the right size.
- // CHECK-DAG: %[[FILL_CST:.*]] = arith.constant 0.0{{0*e\+00}} : f32
- // CHECK-DAG: %[[EMPTY:.*]] = tensor.empty() : tensor<32x64xf32>
- // CHECK: %[[PADDED_FILL:.*]] = linalg.fill ins(%[[FILL_CST]] : f32) outs(%[[EMPTY]] : tensor<32x64xf32>) -> tensor<32x64xf32>
- // CHECK: return %[[PADDED_FILL]]
- %cst = arith.constant 0.0 : f32
- %cst2 = arith.constant 0.0 : f32
- %c0 = arith.constant 0 : index
- %fill = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<253x123xf32>) -> tensor<253x123xf32>
- %extracted_slice = tensor.extract_slice %fill[0, 0] [%size0, %size1] [1, 1] : tensor<253x123xf32> to tensor<?x?xf32>
- %padded = tensor.pad %extracted_slice low[%c0, %c0] high[%high0, %high1] {
- ^bb0(%arg3: index, %arg4: index):
- tensor.yield %cst2 : f32
- } : tensor<?x?xf32> to tensor<32x64xf32>
- return %padded : tensor<32x64xf32>
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %0 {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- } : !transform.any_op
-}
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/transform_match_partial_reduction.mlir b/compiler/src/iree/compiler/Codegen/Common/test/transform_match_partial_reduction.mlir
index 61baa7f..3e4e546 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/transform_match_partial_reduction.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/transform_match_partial_reduction.mlir
@@ -24,20 +24,22 @@
return %result, %fill2 : tensor<8xf32>, tensor<32xf32>
}
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
- transform.iree.register_match_callbacks
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+ transform.iree.register_match_callbacks
- %leading, %fill, %reduction, %trailing =
- transform.iree.match_callback failures(propagate) "reduction_partial"(%arg0)
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+ %leading, %fill, %reduction, %trailing =
+ transform.iree.match_callback failures(propagate) "reduction_partial"(%root)
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
- transform.iree.emit_remark "leading" at %leading : !transform.any_op
- transform.iree.emit_remark "fill" at %fill : !transform.any_op
- transform.iree.emit_remark "reduction" at %reduction : !transform.any_op
- transform.iree.emit_remark "trailing" at %trailing : !transform.any_op
+ transform.iree.emit_remark "leading" at %leading : !transform.any_op
+ transform.iree.emit_remark "fill" at %fill : !transform.any_op
+ transform.iree.emit_remark "reduction" at %reduction : !transform.any_op
+ transform.iree.emit_remark "trailing" at %trailing : !transform.any_op
- // expected-error @below {{failed to match}}
- transform.iree.match_callback failures(propagate) "reduction"(%arg0)
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-}
+ // expected-error @below {{failed to match}}
+ transform.iree.match_callback failures(propagate) "reduction"(%root)
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+ transform.yield
+ } // @__transform_main
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/transform_ops_invalid.mlir b/compiler/src/iree/compiler/Codegen/Common/test/transform_ops_invalid.mlir
index 8a5deaf..e8be084 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/transform_ops_invalid.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/transform_ops_invalid.mlir
@@ -1,75 +1,70 @@
// RUN: iree-opt %s --split-input-file --iree-transform-dialect-interpreter --verify-diagnostics
-module {
- transform.sequence failures(propagate) {
- ^bb0(%arg0: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
// expected-error @below {{match registry not available}}
transform.iree.match_callback failures(suppress) "_test_match_callback"() : () -> ()
- }
-}
+ transform.yield
+ } // @__transform_main
+} // module
+
// -----
-module {
- transform.sequence failures(propagate) {
- ^bb0(%arg0: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
transform.iree.register_match_callbacks
// expected-error @below {{callback '_non_existing_name_' not found in the registry}}
transform.iree.match_callback failures(suppress) "_non_existing_name_"() : () -> ()
- }
-}
+ transform.yield
+ } // @__transform_main
+} // module
+
// -----
-module {
- transform.sequence failures(propagate) {
- ^bb0(%arg0: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
transform.iree.register_match_callbacks
// expected-error @below {{callback produced a different number of handles than expected}}
- transform.iree.match_callback failures(suppress) "_test_match_callback"(%arg0) : (!transform.any_op) -> ()
- }
-}
+ transform.iree.match_callback failures(suppress) "_test_match_callback"(%root) : (!transform.any_op) -> ()
+ transform.yield
+ } // @__transform_main
+} // module
+
// -----
// Successful match.
-module {
- transform.sequence failures(propagate) {
- ^bb0(%arg0: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
transform.iree.register_match_callbacks
- transform.iree.match_callback failures(propagate) "_test_match_callback"(%arg0) : (!transform.any_op) -> (!transform.any_op)
- }
-}
+ transform.iree.match_callback failures(propagate) "_test_match_callback"(%root) : (!transform.any_op) -> (!transform.any_op)
+ transform.yield
+ } // @__transform_main
+} // module
+
// -----
-module attributes {test.iree_transform_do_not_match} {
- transform.sequence failures(propagate) {
- ^bb0(%arg0: !transform.any_op):
+module attributes { transform.with_named_sequence , test.iree_transform_do_not_match } {
+ transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
transform.iree.register_match_callbacks
// expected-error @below {{failed to match}}
- transform.iree.match_callback failures(propagate) "_test_match_callback"(%arg0) : (!transform.any_op) -> (!transform.any_op)
- }
-}
+ transform.iree.match_callback failures(propagate) "_test_match_callback"(%root) : (!transform.any_op) -> (!transform.any_op)
+ transform.yield
+ } // @__transform_main
+} // module
+
// -----
// Failed to match, but the op silences such errors.
-module attributes {test.iree_transform_do_not_match} {
- transform.sequence failures(propagate) {
- ^bb0(%arg0: !transform.any_op):
+module attributes { transform.with_named_sequence, test.iree_transform_do_not_match } {
+ transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
transform.iree.register_match_callbacks
- transform.iree.match_callback failures(suppress) "_test_match_callback"(%arg0) : (!transform.any_op) -> (!transform.any_op)
- }
-}
+ transform.iree.match_callback failures(suppress) "_test_match_callback"(%root) : (!transform.any_op) -> (!transform.any_op)
+ transform.yield
+ } // @__transform_main
+} // module
-// -----
-
-// Failed to match, but the parent sequence silences all errors.
-module attributes {test.iree_transform_do_not_match} {
- transform.sequence failures(suppress) {
- ^bb0(%arg0: !transform.any_op):
- transform.iree.register_match_callbacks
- transform.iree.match_callback failures(propagate) "_test_match_callback"(%arg0) : (!transform.any_op) -> (!transform.any_op)
- }
-}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
index a57d686..9784944 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
@@ -6,8 +6,8 @@
# Tests for common transforms.
-load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
package(
features = ["layering_check"],
@@ -56,7 +56,6 @@
"tile.mlir",
"tile_and_fuse.mlir",
"transform_dialect_bufferize.mlir",
- "transform_dialect_iree_tile_to_forall.mlir",
"transpose_avx2_lowering.mlir",
"unfused_fma.mlir",
"vector_contract_to_arm_asm.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
index cdc786f..2add8b7 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
@@ -51,7 +51,6 @@
"tile.mlir"
"tile_and_fuse.mlir"
"transform_dialect_bufferize.mlir"
- "transform_dialect_iree_tile_to_forall.mlir"
"transpose_avx2_lowering.mlir"
"unfused_fma.mlir"
"vector_contract_to_arm_asm.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir
index 905690f..273543b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir
@@ -35,9 +35,11 @@
}
}
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> (!transform.any_op)
- %func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.consumed}) {
+ transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+ %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> (!transform.any_op)
+ %func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_iree_tile_to_forall.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_iree_tile_to_forall.mlir
deleted file mode 100644
index ac1f307..0000000
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_iree_tile_to_forall.mlir
+++ /dev/null
@@ -1,168 +0,0 @@
-// RUN: iree-opt %s -iree-transform-dialect-interpreter -transform-dialect-drop-schedule --cse --split-input-file | FileCheck %s
-
-// Check that we can specify `num_threads` when lowering
-// `workgroup_count_from_slice` using
-// `transform.iree.populate_workgroup_count_region_using_num_threads_slice`
-
-
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>
-
-// Check that num_threads (32) is reflected in the map.
-// CHECK: #[[$NUM_THREADS_MAP:.*]] = affine_map<(d0) -> (d0 * 32)>
-
-hal.executable private @matmul_static_dispatch_0 {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
-
- hal.executable.export public @matmul_static_dispatch_0_matmul_1024x4096x12345 ordinal(0) layout(#pipeline_layout) {
- // Check that num_threads is reflected in the workgroup size.
- // CHECK-LABEL: hal.executable.export public @matmul_static_dispatch_0_matmul_1024x4096x12345
- // CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index
- // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
- // CHECK: hal.return %[[C32]], %[[C1]], %[[C1]] : index, index, index
- ^bb0(%arg0: !hal.device):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice
- hal.return %x, %y, %z : index, index, index
- }
-
- builtin.module {
- func.func @matmul_static_dispatch_0_matmul_1024x4096x12345() {
- // Check that the tiling matches num_threads.
- // CHECK-LABEL: func.func @matmul_static_dispatch_0_matmul_1024x4096x12345
- // CHECK: = scf.forall (%[[IV:.*]]) in (32) shared_outs(%{{.*}}) -> (tensor<1024x4096xf32>) {
- // CHECK: %[[OFFSET:.*]] = affine.apply #[[$NUM_THREADS_MAP]](%[[IV]])
- // CHECK: %extracted_slice = tensor.extract_slice %{{.*}}[%[[OFFSET]], 0] [32, 12345] [1, 1] : tensor<1024x12345xf32> to tensor<32x12345xf32>
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x12345xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<12345x4096xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1024x4096xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 12345], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x12345xf32>> -> tensor<1024x12345xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [12345, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<12345x4096xf32>> -> tensor<12345x4096xf32>
- %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1024, 4096], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<1024x4096xf32>> -> tensor<1024x4096xf32>
- %6 = linalg.matmul ins(%3, %4 : tensor<1024x12345xf32>, tensor<12345x4096xf32>) outs(%5 : tensor<1024x4096xf32>) -> tensor<1024x4096xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1024, 4096], strides = [1, 1] : tensor<1024x4096xf32> -> !flow.dispatch.tensor<readwrite:tensor<1024x4096xf32>>
- return
- }
- }
- }
-}
-
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
- %original_matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op
- : (!transform.any_op) -> !transform.any_op
-
- %matmul, %forall =
- transform.structured.tile_using_forall %original_matmul num_threads [32]
- ( mapping = [#gpu.block<x>] )
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
- // Late canonicalizations to cleanup and pass the checks.
- // Needs to occur on the whole variant to perform cse on the workgroup_count region
- transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall
- : (!transform.any_op) -> ()
-}
-
-// -----
-
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>
-
-hal.executable private @matmul_static_dispatch_0 {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
-
- hal.executable.export public @elementwise_out_of_order_block_id ordinal(0) layout(#pipeline_layout) {
- // Check that num_threads is consistent with the specified mapping
- // CHECK-LABEL: hal.executable.export public @elementwise_out_of_order_block_id
-
- // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
- // CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index
- // CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index
- // CHECK: hal.return %[[C3]], %[[C5]], %[[C8]] : index, index, index
- ^bb0(%arg0: !hal.device):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice
- hal.return %x, %y, %z : index, index, index
- }
-
- builtin.module {
- func.func @elementwise_out_of_order_block_id() {
- // CHECK-LABEL: func.func @elementwise_out_of_order_block_id
- // CHECK: = scf.forall (%[[IV:.*]]) in (3, 5, 8) shared_outs(%{{.*}}) -> (tensor<3x5x8xf32>) {
- // CHECK: } {mapping = [#gpu.block<x>, #gpu.block<y>, #gpu.block<z>]}
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<3x5x8xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<3x5x8xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [3, 5, 8], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x5x8xf32>> -> tensor<3x5x8xf32>
- %empty = tensor.empty() : tensor<3x5x8xf32>
- %3 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
- affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel"]}
- ins(%2 : tensor<3x5x8xf32>) outs(%empty : tensor<3x5x8xf32>) {
- ^bb0(%in: f32, %in_0: f32):
- %4 = math.sqrt %in : f32
- linalg.yield %4 : f32
- } -> tensor<3x5x8xf32>
- flow.dispatch.tensor.store %3, %1, offsets = [0, 0, 0], sizes = [3, 5, 8], strides = [1, 1, 1] : tensor<3x5x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<3x5x8xf32>>
- return
- }
- }
- }
-}
-
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
- %1 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %tiled_op, %forall_op = transform.structured.tile_using_forall %1 num_threads [] tile_sizes [1, 1, 1](mapping = [#gpu.block<x>, #gpu.block<y>, #gpu.block<z>]): (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_op : (!transform.any_op) -> ()
-}
-
-// -----
-
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
-
-hal.executable private @matmul_static_dispatch_0 {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
-
- hal.executable.export public @vecadd2d_dispatch_0_generic_9x512_f32 ordinal(0) layout(#pipeline_layout) {
- // Check that num_threads is consistent with the specified mapping
- // CHECK-LABEL: hal.executable.export public @vecadd2d_dispatch_0_generic_9x512_f32
-
- // CHECK-DAG: %[[C171:.*]] = arith.constant 171 : index
- // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
- // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
- // CHECK: hal.return %[[C171]], %[[C1]], %[[C2]] : index, index, index
- ^bb0(%arg0: !hal.device):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice
- hal.return %x, %y, %z : index, index, index
- }
-
- builtin.module {
- func.func @vecadd2d_dispatch_0_generic_9x512_f32() {
- %c18432 = arith.constant 18432 : index
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c18432) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<9x512xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x9xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<512x9xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [9, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<9x512xf32>> -> tensor<9x512xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 9], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x9xf32>> -> tensor<512x9xf32>
- %5 = tensor.empty() : tensor<512x9xf32>
- %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1, d0)>], iterator_types = ["parallel", "parallel"]} ins(%3, %4 : tensor<9x512xf32>, tensor<512x9xf32>) outs(%5 : tensor<512x9xf32>) {
- ^bb0(%in: f32, %in_0: f32, %out: f32):
- %7 = arith.addf %in, %in_0 : f32
- linalg.yield %7 : f32
- } -> tensor<512x9xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [512, 9], strides = [1, 1] : tensor<512x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<512x9xf32>>
- return
- }
- }
- }
-}
-
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
- %1 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %tiled_op, %forall_op = transform.structured.tile_using_forall %1 num_threads [] tile_sizes [5, 3](mapping = [#gpu.block<z>, #gpu.block<x>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_op : (!transform.any_op) -> ()
-}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
index 3d72d66..a05a27d 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
@@ -66,6 +66,7 @@
# tensor_dialect_*_spec is a an MLIR file that specifies a
# transformation, it needs to be included as data.
exclude = [
+ "attention_transform_spec.mlir",
"transform_dialect_codegen_bufferize_spec.mlir",
"transform_dialect_codegen_foreach_to_gpu_spec.mlir",
"transform_dialect_codegen_vector_distribution_spec.mlir",
@@ -74,6 +75,7 @@
),
cfg = "//compiler:lit.cfg.py",
data = [
+ "attention_transform_spec.mlir",
"transform_dialect_codegen_bufferize_spec.mlir",
"transform_dialect_codegen_foreach_to_gpu_spec.mlir",
"transform_dialect_codegen_vector_distribution_spec.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
index 0997aa8..562dbad 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
@@ -61,6 +61,7 @@
FileCheck
iree-opt
DATA
+ attention_transform_spec.mlir
transform_dialect_codegen_bufferize_spec.mlir
transform_dialect_codegen_foreach_to_gpu_spec.mlir
transform_dialect_codegen_vector_distribution_spec.mlir
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir
index 313a3f1..840fa2a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir
@@ -1,6 +1,5 @@
-// RUN: iree-opt %s --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-select-lowering-strategy, iree-llvmgpu-lower-executable-target)))' \
-// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%s | \
+// RUN: iree-opt %s --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-transform-dialect-interpreter)))' \
+// RUN:   --iree-codegen-transform-dialect-library=%p/attention_transform_spec.mlir | \
// RUN: FileCheck --check-prefix=CHECK %s
hal.executable @_attention_dispatch_0 {
@@ -29,133 +28,6 @@
}
}
-transform.sequence failures(propagate) {
- ^bb0(%variant_op: !transform.any_op):
-
- // Get attention op
- // ==========================================
- %attention = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-
- // Tile and distribute to workgroups
- // ==========================================
- %tiled_attention, %forall_grid =
- transform.structured.tile_using_forall %attention tile_sizes [1, 128]
- ( mapping = [#gpu.block<x>, #gpu.block<y>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
-
- // Tile batch dimensions of attention
- // ==========================================
- %attention2 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %batch_tiled_attn, %loop = transform.structured.tile_using_for %attention2 [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %top_level_func {
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_cse %top_level_func : !transform.any_op
-
- // Promote query and output operands
- // ==========================================
- %attention3 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %promoted_attention, %alloc_a0, %alloc_a1 = transform.iree.promote_operands %attention3 [0, 3]
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
- // Tile and decompose attention
- // ==========================================
- %attention4 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %acc_fill, %max_fill, %sum_fill, %inner_loop,
- %fill_op, %first_matmul, %reduce_max, %partial_softmax, %update, %reduce_sum, %reciprocal_sum, %softmax, %truncate, %scale_acc, %second_matmul, %last_truncate
- = tile_and_decompose_attention %attention4 :
- (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-
- // Promote key and value operands
- // ==========================================
- %promoted_first_matmul, %alloc0 = transform.iree.promote_operands %first_matmul [1]
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- %promoted_second_matmul, %alloc1 = transform.iree.promote_operands %second_matmul [1]
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
- // Tile and fuse attention ops
- // ==========================================
- %tiled_matmul, %forall = transform.structured.tile_using_forall %promoted_second_matmul tile_sizes [32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
- %f0, %loop0 = transform.structured.fuse_into_containing_op %scale_acc into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- %f1, %loop1 = transform.structured.fuse_into_containing_op %truncate into %loop0 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- %f2, %loop2 = transform.structured.fuse_into_containing_op %softmax into %loop1 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
- %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- transform.iree.apply_cse %func : !transform.any_op
-
- %f3, %loop3 = transform.structured.fuse_into_containing_op %reciprocal_sum into %loop2 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- %f4, %loop4 = transform.structured.fuse_into_containing_op %reduce_sum into %loop3 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.iree.apply_cse %func : !transform.any_op
-
- %f5, %loop5 = transform.structured.fuse_into_containing_op %update into %loop4 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- %f6, %loop6 = transform.structured.fuse_into_containing_op %partial_softmax into %loop5 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.iree.apply_cse %func : !transform.any_op
-
- %f7, %loop7 = transform.structured.fuse_into_containing_op %reduce_max into %loop6 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- %f8, %loop8 = transform.structured.fuse_into_containing_op %promoted_first_matmul into %loop7 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- %f9, %loop9 = transform.structured.fuse_into_containing_op %fill_op into %loop8 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.iree.apply_cse %func : !transform.any_op
-
- // Distribute fills and last truncate
- // ==========================================
- %fills = transform.merge_handles %acc_fill, %max_fill, %sum_fill, %last_truncate : !transform.any_op
- %tiled_fill, %fill_grid = transform.structured.tile_using_forall %fills tile_sizes[32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
- // Vectorize function
- // ==========================================
- transform.apply_patterns to %func {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> (!transform.any_op)
-
- // Bufferization
- // ==========================================
- transform.apply_patterns to %func_3 {
- transform.apply_patterns.tensor.reassociative_reshape_folding
- transform.apply_patterns.canonicalization
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- } : !transform.any_op
- transform.iree.apply_cse %func_3 : !transform.any_op
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- transform.apply_patterns to %func_3 { transform.apply_patterns.linalg.erase_unnecessary_inputs } : !transform.any_op
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
-
- // Step 5. Pre-process the contract and transfer ops to put it in the right form.
- // ===========================================================================
- %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func_2 {
- transform.apply_patterns.iree.prepare_vector_to_mma
- } : !transform.any_op
-
- // Step 6. Post-bufferization vector distribution
- // ===========================================================================
- %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
- transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 4] subgroup_size = 32 : (!transform.any_op) -> ()
-
- transform.apply_patterns to %func_7 {
- transform.apply_patterns.memref.fold_memref_alias_ops
- } : !transform.any_op
- transform.iree.apply_licm %func_7 : !transform.any_op
- transform.apply_patterns to %func_7 {
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_cse %func_7 : !transform.any_op
- %func_8 = transform.structured.hoist_redundant_vector_transfers %func_7
- : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func_8 {
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_cse %func_8 : !transform.any_op
- transform.memref.erase_dead_alloc_and_stores %func_8 : (!transform.any_op) -> ()
-}
-
// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0] -> (s0 * 128)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
// CHECK-DAG: #[[MAP2:.+]] = affine_map<()[s0, s1, s2] -> (s2 * 32 + ((s0 + s1 * 4) floordiv 32) * 32)>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_transform_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_transform_spec.mlir
new file mode 100644
index 0000000..ae62eb4
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_transform_spec.mlir
@@ -0,0 +1,128 @@
+
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.consumed}) {
+ // Get attention op
+ // ==========================================
+ %attention = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+
+ // Tile and distribute to workgroups
+ // ==========================================
+ %tiled_attention, %forall_grid =
+ transform.structured.tile_using_forall %attention tile_sizes [1, 128]
+ ( mapping = [#gpu.block<x>, #gpu.block<y>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
+
+ // Tile batch dimensions of attention
+ // ==========================================
+ %attention2 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %batch_tiled_attn, %loop = transform.structured.tile_using_for %attention2 [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %top_level_func {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_cse %top_level_func : !transform.any_op
+
+ // Promote query and output operands
+ // ==========================================
+ %attention3 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %promoted_attention, %alloc_a0, %alloc_a1 = transform.iree.promote_operands %attention3 [0, 3]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+ // Tile and decompose attention
+ // ==========================================
+ %attention4 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %acc_fill, %max_fill, %sum_fill, %inner_loop,
+ %fill_op, %first_matmul, %reduce_max, %partial_softmax, %update, %reduce_sum, %reciprocal_sum, %softmax, %truncate, %scale_acc, %second_matmul, %last_truncate
+ = transform.tile_and_decompose_attention %attention4 :
+ (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+
+ // Promote key and value operands
+ // ==========================================
+ %promoted_first_matmul, %alloc0 = transform.iree.promote_operands %first_matmul [1]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %promoted_second_matmul, %alloc1 = transform.iree.promote_operands %second_matmul [1]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // Tile and fuse attention ops
+ // ==========================================
+ %tiled_matmul, %forall = transform.structured.tile_using_forall %promoted_second_matmul tile_sizes [32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ %f0, %loop0 = transform.structured.fuse_into_containing_op %scale_acc into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %f1, %loop1 = transform.structured.fuse_into_containing_op %truncate into %loop0 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %f2, %loop2 = transform.structured.fuse_into_containing_op %softmax into %loop1 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ transform.iree.apply_cse %func : !transform.any_op
+
+ %f3, %loop3 = transform.structured.fuse_into_containing_op %reciprocal_sum into %loop2 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %f4, %loop4 = transform.structured.fuse_into_containing_op %reduce_sum into %loop3 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.iree.apply_cse %func : !transform.any_op
+
+ %f5, %loop5 = transform.structured.fuse_into_containing_op %update into %loop4 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %f6, %loop6 = transform.structured.fuse_into_containing_op %partial_softmax into %loop5 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.iree.apply_cse %func : !transform.any_op
+
+ %f7, %loop7 = transform.structured.fuse_into_containing_op %reduce_max into %loop6 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %f8, %loop8 = transform.structured.fuse_into_containing_op %promoted_first_matmul into %loop7 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %f9, %loop9 = transform.structured.fuse_into_containing_op %fill_op into %loop8 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.iree.apply_cse %func : !transform.any_op
+
+ // Distribute fills and last truncate
+ // ==========================================
+ %fills = transform.merge_handles %acc_fill, %max_fill, %sum_fill, %last_truncate : !transform.any_op
+ %tiled_fill, %fill_grid = transform.structured.tile_using_forall %fills tile_sizes[32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // Vectorize function
+ // ==========================================
+ transform.apply_patterns to %func {
+ transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+ transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+ transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+ } : !transform.any_op
+ %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> (!transform.any_op)
+
+ // Bufferization
+ // ==========================================
+ transform.apply_patterns to %func_3 {
+ transform.apply_patterns.tensor.reassociative_reshape_folding
+ transform.apply_patterns.canonicalization
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ } : !transform.any_op
+ transform.iree.apply_cse %func_3 : !transform.any_op
+ transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+ transform.apply_patterns to %func_3 { transform.apply_patterns.linalg.erase_unnecessary_inputs } : !transform.any_op
+ %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
+
+ // Step 5. Pre-process the contract and transfer ops to put it in the right form.
+ // ===========================================================================
+ %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func_2 {
+ transform.apply_patterns.iree.prepare_vector_to_mma
+ } : !transform.any_op
+
+ // Step 6. Post-bufferization vector distribution
+ // ===========================================================================
+ %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
+ transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 4] subgroup_size = 32 : (!transform.any_op) -> ()
+
+ transform.apply_patterns to %func_7 {
+ transform.apply_patterns.memref.fold_memref_alias_ops
+ } : !transform.any_op
+ transform.iree.apply_licm %func_7 : !transform.any_op
+ transform.apply_patterns to %func_7 {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_cse %func_7 : !transform.any_op
+ %func_8 = transform.structured.hoist_redundant_vector_transfers %func_7
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func_8 {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_cse %func_8 : !transform.any_op
+ transform.memref.erase_dead_alloc_and_stores %func_8 : (!transform.any_op) -> ()
+ transform.yield
+ }
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/create_async_groups.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/create_async_groups.mlir
index a716a2f..1fc2285 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/create_async_groups.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/create_async_groups.mlir
@@ -21,13 +21,15 @@
// CHECK: nvgpu.device_async_wait %[[G]]
return
}
+}
- transform.sequence failures(propagate) {
- ^bb1(%variant_op: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
%top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
transform.iree.create_async_groups %top_level_func {use_mma_sync} : (!transform.any_op) -> ()
+ transform.yield
}
-}
+} // module
// -----
@@ -53,13 +55,15 @@
// CHECK: nvgpu.device_async_wait %[[G]]
return
}
+}
- transform.sequence failures(propagate) {
- ^bb1(%variant_op: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
%top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
transform.iree.create_async_groups %top_level_func : (!transform.any_op) -> ()
+ transform.yield
}
-}
+} // module
// -----
@@ -80,15 +84,17 @@
vector.transfer_write %2, %0[%c0, %c4, %c0] {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
return
}
+}
- transform.sequence failures(propagate) {
- ^bb1(%variant_op: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
%top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
%vector_transfer = transform.structured.match ops{["memref.alloc"]} in %top_level_func : (!transform.any_op) -> !transform.any_op
// expected-error@below {{transform applied to the wrong op kind}}
transform.iree.create_async_groups %vector_transfer : (!transform.any_op) -> ()
+ transform.yield
}
-}
+} // module
// -----
@@ -112,13 +118,15 @@
// CHECK: nvgpu.device_async_wait %[[G]]
return
}
+}
- transform.sequence failures(propagate) {
- ^bb1(%variant_op: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
%top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
transform.iree.create_async_groups %top_level_func : (!transform.any_op) -> ()
+ transform.yield
}
-}
+} // module
// -----
@@ -148,10 +156,12 @@
// CHECK-NOT: nvgpu.device_async_create_group
return
}
+}
- transform.sequence failures(propagate) {
- ^bb1(%variant_op: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
%top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
transform.iree.create_async_groups %top_level_func : (!transform.any_op) -> ()
+ transform.yield
}
-}
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/layout_analysis_and_distribution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/layout_analysis_and_distribution.mlir
index 689e1d2..bc34439 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/layout_analysis_and_distribution.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/layout_analysis_and_distribution.mlir
@@ -1,6 +1,6 @@
// RUN: iree-opt %s -iree-transform-dialect-interpreter -transform-dialect-drop-schedule -cse -split-input-file --verify-diagnostics | FileCheck %s
-builtin.module {
+builtin.module attributes { transform.with_named_sequence } {
func.func @matmul_dispatch_0_matmul_16x8x16() {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<16x8xf16>
@@ -17,12 +17,14 @@
vector.transfer_write %5, %2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<16x8xf16>
return
}
- transform.sequence failures(propagate) {
- ^bb1(%variant_op: !transform.any_op):
+
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
%top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
%transformed_func = transform.iree.layout_analysis_and_distribution %top_level_func : (!transform.any_op) -> (!transform.any_op)
+ transform.yield
}
-}
+} // module
+
// CHECK-DAG: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d1 + d2 * 16)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0 * 2)>
@@ -128,7 +130,7 @@
// -----
-builtin.module {
+builtin.module attributes { transform.with_named_sequence } {
func.func @matmul_reduction() {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<16x8xf16>
@@ -149,12 +151,13 @@
vector.transfer_write %8, %2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<16x8xf16>
return
}
- transform.sequence failures(propagate) {
- ^bb1(%variant_op: !transform.any_op):
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
%top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
%transformed_func = transform.iree.layout_analysis_and_distribution %top_level_func : (!transform.any_op) -> (!transform.any_op)
+ transform.yield
}
-}
+} // module
+
// CHECK-DAG: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d1 + d2 * 16)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0 * 2)>
@@ -318,7 +321,7 @@
#map4 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map5 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map6 = affine_map<(d0, d1, d2) -> (d0, d1)>
-builtin.module {
+builtin.module attributes { transform.with_named_sequence } {
func.func @matmul_scf() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
@@ -348,12 +351,13 @@
vector.transfer_write %7, %3[%8, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<16x8xf16>
return
}
- transform.sequence failures(propagate) {
- ^bb1(%variant_op: !transform.any_op):
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
%top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
%transformed_func = transform.iree.layout_analysis_and_distribution %top_level_func : (!transform.any_op) -> (!transform.any_op)
+ transform.yield
}
-}
+} // module
+
// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0] -> (s0 * 16)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (d0 + s0)>
@@ -505,7 +509,7 @@
#map4 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map5 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map6 = affine_map<(d0, d1, d2) -> (d0, d1)>
-builtin.module {
+builtin.module attributes { transform.with_named_sequence } {
func.func @matmul_scf() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
@@ -535,12 +539,13 @@
vector.transfer_write %7, %3[%8, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<16x8xf16>
return
}
- transform.sequence failures(propagate) {
- ^bb1(%variant_op: !transform.any_op):
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
%top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
%transformed_func = transform.iree.layout_analysis_and_distribution %top_level_func : (!transform.any_op) -> (!transform.any_op)
+ transform.yield
}
-}
+} // module
+
// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0] -> (s0 * 16)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 * 16)>
@@ -679,7 +684,7 @@
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
-builtin.module {
+builtin.module attributes { transform.with_named_sequence } {
func.func @matmul_dispatch_0_matmul_16x8x16() {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<16x8xf16>
@@ -702,12 +707,13 @@
vector.transfer_write %10, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<16x8xf16, strided<[8, 1], offset: ?>>
return
}
- transform.sequence failures(propagate) {
- ^bb1(%variant_op: !transform.any_op):
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
%top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
%transformed_func = transform.iree.layout_analysis_and_distribution %top_level_func : (!transform.any_op) -> (!transform.any_op)
+ transform.yield
}
-}
+} // module
+
// CHECK-DAG: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d1 + d2 * 16)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0 * 2)>
@@ -836,7 +842,7 @@
// -----
-builtin.module {
+builtin.module attributes { transform.with_named_sequence } {
func.func @matmul_dispatch_0_matmul_16x8x16_shared() {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<16x8xf16>
@@ -851,12 +857,12 @@
vector.transfer_write %5, %2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<16x8xf16>
return
}
- transform.sequence failures(propagate) {
- ^bb1(%variant_op: !transform.any_op):
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
%top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
%transformed_func = transform.iree.layout_analysis_and_distribution %top_level_func : (!transform.any_op) -> (!transform.any_op)
+ transform.yield
}
-}
+} // module
// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d1 + d2 * 16)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0 * 2)>
@@ -913,7 +919,7 @@
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
-builtin.module {
+builtin.module attributes { transform.with_named_sequence } {
func.func @matmul_dispatch_0_matmul_16x16x16_f16() {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<16x16xf16>
@@ -973,13 +979,13 @@
vector.transfer_write %32, %subview[%c0_0, %c0_0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<16x8xf16, strided<[8, 1], offset: ?>>
return
}
- transform.sequence failures(propagate) {
- ^bb1(%variant_op: !transform.any_op):
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
%top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
%reordered_func = transform.iree.reorder_transpose %top_level_func : (!transform.any_op) -> !transform.any_op
- transform.iree.apply_cse %reordered_func : !transform.any_op
+ transform.iree.apply_cse %reordered_func : !transform.any_op
+ transform.yield
}
-}
+} // module
// CHECK-DAG: #[[MAP:.+]] = affine_map<(d0) -> (d0 * 16)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1) -> (d1, d0)>
@@ -1064,7 +1070,7 @@
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
-builtin.module {
+builtin.module attributes { transform.with_named_sequence } {
func.func @double_matmul_dispatch_0_matmul_16x16x16() {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<16x16xf16>
@@ -1099,12 +1105,12 @@
vector.transfer_write %9, %subview[%c0_1, %c0_1] {in_bounds = [true, true]} : vector<16x8xf16>, memref<16x8xf16, strided<[8, 1], offset: ?>>
return
}
- transform.sequence failures(propagate) {
- ^bb1(%variant_op: !transform.any_op):
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
%top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
%transformed_func = transform.iree.layout_analysis_and_distribution %top_level_func : (!transform.any_op) -> (!transform.any_op)
+ transform.yield
}
-}
+} // module
// CHECK-DAG: #[[MAP:.+]] = affine_map<(d0) -> (d0 * 16)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d1 + d2 * 16)>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir
index e6c4d1b..dff5d2c 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir
@@ -1,11 +1,11 @@
// RUN: iree-opt %s --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-select-lowering-strategy, iree-llvmgpu-lower-executable-target)))" \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/transform_dialect_codegen_bufferize_spec.mlir | \
+// RUN: --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_bufferize_spec.mlir | \
// RUN: FileCheck %s
// RUN: iree-opt %s --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-select-lowering-strategy, iree-llvmgpu-lower-executable-target)))" \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/transform_dialect_codegen_foreach_to_gpu_spec.mlir | \
+// RUN: --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_foreach_to_gpu_spec.mlir | \
// RUN: FileCheck %s --check-prefix=FOREACH-TO-GPU
#device_target_cuda = #hal.device.target<"cuda", {executable_targets = [#hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>]}>
@@ -14,7 +14,11 @@
module attributes {hal.device.targets = [#device_target_cuda]} {
hal.executable private @matmul_static_dispatch_0 {
hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) {
- hal.executable.export public @matmul_static_dispatch_0 ordinal(0) layout(#pipeline_layout)
+ hal.executable.export public @matmul_static_dispatch_0 ordinal(0) layout(#pipeline_layout){
+ ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
+ %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
+ hal.return %x, %y, %z : index, index, index
+ }
builtin.module {
func.func @matmul_static_dispatch_0() {
%c0 = arith.constant 0 : index
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir
index edbbfa4..839d389 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir
@@ -81,7 +81,7 @@
}
-// CHECK: transform.sequence failures(propagate) {
+// CHECK: transform.named_sequence
// CHECK: transform.iree.register_match_callbacks
// CHECK: %[[MATCH:.+]]:2 = transform.iree.match_callback failures(propagate) "batch_matmul"
// CHECK: %[[TILED:.+]], %[[FORALL:.+]] = transform.structured.tile_using_forall %[[MATCH]]#1
@@ -98,7 +98,7 @@
// CHECK: %[[PADDED:.+]], %{{.*}}, %{{.+}} = transform.structured.pad %tiled_linalg_op
// CHECK: pack_paddings = [1, 1, 1, 1], pad_to_multiple_of = [1, 1, 1, 1], padding_dimensions = [0, 1, 2, 3]
// CHECK: padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]}
-// CHECK: %[[V3:.+]] = get_producer_of_operand %[[PADDED]][2]
+// CHECK: %[[V3:.+]] = transform.get_producer_of_operand %[[PADDED]][2]
// CHECK: transform.structured.hoist_pad %{{.*}} by 1 loops
// CHECK: apply_patterns
// CHECK: transform.iree.apply_licm
@@ -109,8 +109,8 @@
// CHECK: transform.iree.apply_cse
// CHECK: transform.structured.match ops{["tensor.parallel_insert_slice"]}
// CHECK: transform.structured.insert_slice_to_copy
-// CHECK: %[[LHS:.+]] = get_producer_of_operand %[[PADDED]][0]
-// CHECK: %[[RHS:.+]] = get_producer_of_operand %[[PADDED]][1]
+// CHECK: %[[LHS:.+]] = transform.get_producer_of_operand %[[PADDED]][0]
+// CHECK: %[[RHS:.+]] = transform.get_producer_of_operand %[[PADDED]][1]
// CHECK: %[[RHS_DPS:.+]] = transform.structured.rewrite_in_destination_passing_style %[[RHS]]
// CHECK: transform.structured.tile_using_forall %[[LHS]]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir
index 0c18806..3345f41 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir
@@ -29,10 +29,10 @@
// CHECK-LABEL: func @nchw_convolution
-// CHECK: transform.sequence failures(propagate) {
+// CHECK: transform.named_sequence
// CHECK: transform.iree.match_callback failures(propagate) "convolution"
// CHECK: transform.structured.convert_conv2d_to_img2col
-// CHECK: get_producer_of_operand %{{.*}}[0]
+// CHECK: transform.get_producer_of_operand %{{.*}}[0]
// CHECK: transform.apply_patterns.iree.bubble_collapse
// CHECK: transform.structured.tile_using_forall %{{.*}} tile_sizes [1, 128, 128](mapping = [#gpu.block<z>, #gpu.block<y>, #gpu.block<x>])
// CHECK: transform.structured.fuse_into_containing_op
@@ -44,10 +44,10 @@
// CHECK: transform.structured.fuse_into_containing_op
// CHECK: transform.structured.pad %{{.*}} {copy_back_op = "none", pack_paddings = [1, 0, 1], pad_to_multiple_of = [1, 1, 1, 1], padding_dimensions = [0, 1, 2, 3], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]}
// CHECK: transform.structured.match ops{["linalg.fill"]}
-// CHECK: %[[RES:.+]] = get_producer_of_operand %{{.*}}[2]
+// CHECK: %[[RES:.+]] = transform.get_producer_of_operand %{{.*}}[2]
// CHECK: transform.structured.rewrite_in_destination_passing_style %[[RES]]
-// CHECK: %[[LHS:.+]] = get_producer_of_operand %{{.*}}[0]
-// CHECK: %[[RHS:.+]] = get_producer_of_operand %{{.*}}[1]
+// CHECK: %[[LHS:.+]] = transform.get_producer_of_operand %{{.*}}[0]
+// CHECK: %[[RHS:.+]] = transform.get_producer_of_operand %{{.*}}[1]
// CHECK: transform.structured.rewrite_in_destination_passing_style %[[LHS]]
// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [32, 4](mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
// CHECK: transform.structured.tile_using_forall %[[RHS]] num_threads [1, 4, 32](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
@@ -100,13 +100,13 @@
// CHECK-LABEL: func @nhwc_convolution
-// CHECK: transform.sequence failures(propagate) {
+// CHECK: transform.named_sequence
// CHECK: transform.structured.tile_using_forall %{{.*}} tile_sizes [1, 128, 128](mapping = [#gpu.block<z>, #gpu.block<y>, #gpu.block<x>])
// CHECK: transform.structured.pad %{{.*}} {copy_back_op = "none", pack_paddings = [0, 1, 1], pad_to_multiple_of = [1, 1, 1, 1], padding_dimensions = [0, 1, 2, 3], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]}
-// CHECK: %[[RES:.+]] = get_producer_of_operand %{{.*}}[2]
+// CHECK: %[[RES:.+]] = transform.get_producer_of_operand %{{.*}}[2]
// CHECK: transform.structured.rewrite_in_destination_passing_style %[[RES]]
-// CHECK: %[[LHS:.+]] = get_producer_of_operand %{{.*}}[0]
-// CHECK: %[[RHS:.+]] = get_producer_of_operand %{{.*}}[1]
+// CHECK: %[[LHS:.+]] = transform.get_producer_of_operand %{{.*}}[0]
+// CHECK: %[[RHS:.+]] = transform.get_producer_of_operand %{{.*}}[1]
// CHECK: transform.structured.rewrite_in_destination_passing_style %[[RHS]]
// CHECK: transform.structured.tile_using_forall %[[LHS]] num_threads [1, 32, 4](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [4, 32](mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
@@ -148,4 +148,4 @@
// CHECK-LABEL: func @unaligned_convolution
// Currently padding on the img2col op is not supported so bail out for unaligned.
-// CHECK-NOT: transform.sequence
+// CHECK-NOT: transform.named_sequence
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir
index 1c8017d..34e2eba 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir
@@ -67,7 +67,7 @@
// CHECK-LABEL: func @matmul_1
-// CHECK: transform.sequence failures(propagate) {
+// CHECK: transform.named_sequence
// CHECK: transform.iree.match_callback failures(propagate) "matmul"
// CHECK: transform.structured.tile_using_forall %{{.*}} tile_sizes [128, 128](mapping = [#gpu.block<y>, #gpu.block<x>])
// CHECK: transform.structured.fuse_into_containing_op
@@ -127,7 +127,7 @@
// WITH_OPTIONS-LABEL: func @matmul_1
-// WITH_OPTIONS: transform.sequence failures(propagate) {
+// WITH_OPTIONS: transform.named_sequence
// WITH_OPTIONS: transform.iree.match_callback failures(propagate) "matmul"
// Tile sizes are set by td-matmul-strategy-blk-size-XX.
// WITH_OPTIONS: transform.structured.tile_using_forall %{{.*}} tile_sizes [256, 64](mapping = [#gpu.block<y>, #gpu.block<x>])
@@ -233,7 +233,7 @@
// CHECK-LABEL: func @matmul_2
-// CHECK: transform.sequence failures(propagate) {
+// CHECK: transform.named_sequence
// CHECK: transform.iree.match_callback failures(propagate) "matmul"
// CHECK: transform.structured.tile_using_forall %{{.*}} tile_sizes [128, 128](mapping = [#gpu.block<y>, #gpu.block<x>])
// CHECK: transform.iree.populate_workgroup_count_region_using_num_threads_slice
@@ -287,7 +287,7 @@
// CHECK-LABEL: func @matmul_3
-// CHECK: transform.sequence failures(propagate) {
+// CHECK: transform.named_sequence
// WITH_OPTIONS_2-LABEL: func @matmul_3
@@ -335,10 +335,10 @@
// CHECK }
// CHECK: transform.iree.apply_licm
// CHECK: transform.iree.apply_cse
-// CHECK: %[[RES_PAD:.+]] = get_producer_of_operand %{{.*}}[2]
+// CHECK: %[[RES_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[2]
// CHECK: %[[RES_COPY:.+]] = transform.structured.rewrite_in_destination_passing_style %[[RES_PAD]]
-// CHECK: %[[LHS_PAD:.+]] = get_producer_of_operand %{{.*}}[0]
-// CHECK: %[[RHS_PAD:.+]] = get_producer_of_operand %{{.*}}[1]
+// CHECK: %[[LHS_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[0]
+// CHECK: %[[RHS_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[1]
// CHECK: %[[TILED_LHS:.+]], %{{.*}} = transform.structured.tile_using_forall %[[LHS_PAD]] num_threads [32, 4](mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
// CHECK: transform.structured.match ops{["scf.if"]}
// CHECK: transform.scf.take_assumed_branch %{{.*}} take_else_branch
@@ -409,10 +409,10 @@
// CHECK }
// CHECK: transform.iree.apply_licm
// CHECK: transform.iree.apply_cse
-// CHECK: %[[RES_PAD:.+]] = get_producer_of_operand %{{.*}}[2]
+// CHECK: %[[RES_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[2]
// CHECK: %[[RES_COPY:.+]] = transform.structured.rewrite_in_destination_passing_style %[[RES_PAD]]
-// CHECK: %[[LHS_PAD:.+]] = get_producer_of_operand %{{.*}}[0]
-// CHECK: %[[RHS_PAD:.+]] = get_producer_of_operand %{{.*}}[1]
+// CHECK: %[[LHS_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[0]
+// CHECK: %[[RHS_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[1]
// CHECK: %[[LHS_COPY:.+]] = transform.structured.rewrite_in_destination_passing_style %[[LHS_PAD]]
// CHECK: %[[RHS_COPY:.+]] = transform.structured.rewrite_in_destination_passing_style %[[RHS_PAD]]
// CHECK: transform.structured.tile_using_forall %[[LHS_COPY]] num_threads [32, 4](mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
@@ -427,7 +427,8 @@
// Verify we don't go down the path without the flag.
// WITH_OPTIONS-LABEL: func @aligned_matmul
-// WITH_OPTIONS-NOT: transform.sequence failures(propagate) {
+// WITH_OPTIONS-NOT: transform.sequence
+// WITH_OPTIONS-NOT: transform.named_sequence
// WITH_OPTIONS_2-LABEL: func @aligned_matmul
@@ -472,7 +473,7 @@
// WITH_OPTIONS_3-LABEL: func @matmul_5_small
// SMALL-LABEL: func @matmul_5_small
-// SMALL: transform.sequence
+// SMALL: transform.named_sequence
// SMALL-NOT: mma
// SMALL-NOT: wmma
@@ -507,6 +508,7 @@
// CHECK: iree_codegen.translation_info<LLVMGPUMatmulSimt>
// CHECK-LABEL: func @f16_matmul
// CHECK-NOT: transform.sequence
+// CHECK-NOT: transform.named_sequence
// WITH_OPTIONS_2-LABEL: func @f16_matmul
@@ -542,18 +544,22 @@
}
// SMALL-LABEL: func @int8_matmul
-// SMALL: transform.sequence
+// SMALL: transform.named_sequence
// SMALL-NOT: mma
// SMALL-NOT: wmma
// CHECK-LABEL: func @int8_matmul
// CHECK-NOT: transform.sequence
+// CHECK-NOT: transform.named_sequence
// WITH_OPTIONS-LABEL: func @int8_matmul
// WITH_OPTIONS-NOT: transform.sequence
+// WITH_OPTIONS-NOT: transform.named_sequence
// WITH_OPTIONS_2-LABEL: func @int8_matmul
// WITH_OPTIONS_2-NOT: transform.sequence
+// WITH_OPTIONS_2-NOT: transform.named_sequence
// WITH_OPTIONS_3-LABEL: func @int8_matmul
// WITH_OPTIONS_3-NOT: transform.sequence
+// WITH_OPTIONS_3-NOT: transform.named_sequence
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir
index 5168e75..118c0ac 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir
@@ -44,7 +44,7 @@
}
// CHECK-LABEL: func @pad
-// CHECK: transform.sequence failures(propagate) {
+// CHECK: transform.named_sequence
// CHECK: transform.iree.register_match_callbacks
// CHECK: {{.*}} = transform.iree.match_callback failures(propagate) "pad"({{.*}}) : (!transform.any_op) -> !transform.any_op
// CHECK: transform.structured.tile_using_forall {{.*}} tile_sizes [64, 64](mapping = [#gpu.block<y>, #gpu.block<x>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir
index 7066ae1..1769c4b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir
@@ -28,10 +28,13 @@
}
}
- transform.sequence failures(propagate) {
- ^bb1(%variant_op: !transform.any_op):
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op: (!transform.any_op) -> !transform.any_op
- %func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- }
+ builtin.module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.consumed}) {
+ transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+ %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op: (!transform.any_op) -> !transform.any_op
+ %func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ } // @__transform_main
+ } // module
+
}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir
index ed73b9d..472b508 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir
@@ -1,6 +1,9 @@
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> !transform.any_op
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(
+ %variant_op: !transform.any_op {transform.consumed}) {
+ transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+ %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> !transform.any_op
+ %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir
index d3df044..ebf730c 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir
@@ -1,42 +1,46 @@
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
- %0 = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %forall, %tiled_fill = transform.structured.tile_using_forall %0 num_threads [5, 1]
- ( mapping = [#gpu.thread<y>, #gpu.thread<x>] )
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(
+ %variant_op: !transform.any_op {transform.consumed}) {
+ %0 = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %forall, %tiled_fill = transform.structured.tile_using_forall %0 num_threads [5, 1]
+ ( mapping = [#gpu.thread<y>, #gpu.thread<x>] )
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- %1 = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %forall_2, %tiled_matmul = transform.structured.tile_using_forall %1 num_threads [7, 9]
- ( mapping = [#gpu.thread<x>, #gpu.thread<y>] )
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %1 = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %forall_2, %tiled_matmul = transform.structured.tile_using_forall %1 num_threads [7, 9]
+ ( mapping = [#gpu.thread<x>, #gpu.thread<y>] )
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- // Canonicalization/CSE is needed before bufferization otherwise unnecessary
- // allocs will be created.
- %func = transform.structured.match ops{["func.func"]} in %variant_op
- : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- } : !transform.any_op
- transform.apply_patterns to %func {
- transform.apply_patterns.tensor.reassociative_reshape_folding
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_cse %func : !transform.any_op
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> (!transform.any_op)
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.iree.map_nested_forall_to_gpu_threads %memref_func
- workgroup_dims = [10, 11, 1] : (!transform.any_op) -> ()
+ // Canonicalization/CSE is needed before bufferization otherwise unnecessary
+ // allocs will be created.
+ %func = transform.structured.match ops{["func.func"]} in %variant_op
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func {
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ } : !transform.any_op
+ transform.apply_patterns to %func {
+ transform.apply_patterns.tensor.reassociative_reshape_folding
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_cse %func : !transform.any_op
+ transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+ %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> (!transform.any_op)
+ %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ transform.iree.map_nested_forall_to_gpu_threads %memref_func
+ workgroup_dims = [10, 11, 1] : (!transform.any_op) -> ()
- // Late canonicalizations to cleanup and pass the checks
- transform.apply_patterns to %memref_func {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_licm %memref_func : !transform.any_op
- transform.iree.apply_cse %memref_func : !transform.any_op
-}
+ // Late canonicalizations to cleanup and pass the checks
+ transform.apply_patterns to %memref_func {
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_licm %memref_func : !transform.any_op
+ transform.iree.apply_cse %memref_func : !transform.any_op
+ transform.yield
+ }
+} // module
+
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_distribution_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_distribution_spec.mlir
index 90a2ee0..04474cd 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_distribution_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_distribution_spec.mlir
@@ -1,23 +1,27 @@
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
- %if_op = transform.structured.match ops{["scf.if"]} in %variant_op
- : (!transform.any_op) -> !transform.any_op
- %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
- : (!transform.any_op) -> !transform.any_op
- %isolated = transform.get_parent_op %warp {isolated_from_above}
- : (!transform.any_op) -> !transform.any_op
- transform.iree.vector.warp_distribute %isolated
- : (!transform.any_op) -> ()
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
- // Late canonicalizations to cleanup and pass the checks.
- %func_op = transform.structured.match ops{["func.func"]} in %variant_op
- : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func_op {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_licm %func_op : !transform.any_op
- transform.iree.apply_cse %func_op : !transform.any_op
-}
+ %if_op = transform.structured.match ops{["scf.if"]} in %variant_op
+ : (!transform.any_op) -> !transform.any_op
+ %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
+ : (!transform.any_op) -> !transform.any_op
+ %isolated = transform.get_parent_op %warp {isolated_from_above}
+ : (!transform.any_op) -> !transform.any_op
+ transform.iree.vector.warp_distribute %isolated
+ : (!transform.any_op) -> ()
+
+ // Late canonicalizations to cleanup and pass the checks.
+ %func_op = transform.structured.match ops{["func.func"]} in %variant_op
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func_op {
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_licm %func_op : !transform.any_op
+ transform.iree.apply_cse %func_op : !transform.any_op
+
+ transform.yield
+ }
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir
index 93a8ae6..33fa29e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir
@@ -1,16 +1,19 @@
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
- %if_op = transform.structured.match ops{["scf.if"]} in %variant_op
- : (!transform.any_op) -> !transform.any_op
- transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
- : (!transform.any_op) -> !transform.any_op
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
- // Late canonicalizations to cleanup and pass the checks.
- %func_op = transform.structured.match ops{["func.func"]} in %variant_op
- : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func_op {
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_licm %func_op : !transform.any_op
- transform.iree.apply_cse %func_op : !transform.any_op
-}
+ %if_op = transform.structured.match ops{["scf.if"]} in %variant_op
+ : (!transform.any_op) -> !transform.any_op
+ transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
+ : (!transform.any_op) -> !transform.any_op
+
+ // Late canonicalizations to cleanup and pass the checks.
+ %func_op = transform.structured.match ops{["func.func"]} in %variant_op
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func_op {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_licm %func_op : !transform.any_op
+ transform.iree.apply_cse %func_op : !transform.any_op
+ transform.yield
+ }
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_eliminate_gpu_barriers.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_eliminate_gpu_barriers.mlir
index 51ce0c9..4babefb 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_eliminate_gpu_barriers.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_eliminate_gpu_barriers.mlir
@@ -18,11 +18,13 @@
return
}
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
- transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
// -----
@@ -44,11 +46,13 @@
return %2 : f32
}
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
- transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
// -----
@@ -66,11 +70,13 @@
return
}
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
- transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
// -----
@@ -95,12 +101,13 @@
return
}
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
- transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
-}
-
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
// -----
@@ -128,11 +135,13 @@
return
}
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
- transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
// -----
@@ -151,11 +160,13 @@
return %0, %1 : memref<42xf32>, memref<10xf32>
}
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
- transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
// -----
@@ -173,11 +184,13 @@
return %0 : memref<42xf32>
}
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
- transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
// -----
@@ -193,11 +206,13 @@
return %0 : f32
}
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
- transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
// -----
@@ -237,8 +252,10 @@
return %0, %1, %2, %3, %4 : f32, f32, f32, f32, f32
}
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
- transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_hoist_allocs.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_hoist_allocs.mlir
index ef85f37..c6e3281 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_hoist_allocs.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_hoist_allocs.mlir
@@ -14,12 +14,14 @@
// CHECK-NEXT: ^bb1:
// CHECK-NEXT: return
-transform.sequence failures(propagate) {
-^bb1(%module: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
%func = transform.structured.match ops{["func.func"]} in %module
: (!transform.any_op) -> !transform.op<"func.func">
transform.iree.hoist_static_alloc %func : (!transform.op<"func.func">) -> ()
-}
+ transform.yield
+ } // @__transform_main
+} // module
// -----
@@ -45,12 +47,14 @@
// CHECK-NEXT: }
// CHECK-NEXT: memref.dealloc %[[ALLOC]] : memref<16x16xi32>
-transform.sequence failures(propagate) {
-^bb1(%module: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
%func = transform.structured.match ops{["func.func"]} in %module
: (!transform.any_op) -> !transform.op<"func.func">
transform.iree.hoist_static_alloc %func : (!transform.op<"func.func">) -> ()
-}
+ transform.yield
+ } // @__transform_main
+} // module
// -----
@@ -77,9 +81,11 @@
// CHECK-NEXT: }
// CHECK-NEXT: memref.dealloc %[[ALLOC]] : memref<16x16xi32>
-transform.sequence failures(propagate) {
-^bb1(%module: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
%func = transform.structured.match ops{["func.func"]} in %module
: (!transform.any_op) -> !transform.op<"func.func">
transform.iree.hoist_static_alloc %func : (!transform.op<"func.func">) -> ()
-}
+ transform.yield
+ } // @__transform_main
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_pack_shared_memory_alloc.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_pack_shared_memory_alloc.mlir
index da0e7bc..504950e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_pack_shared_memory_alloc.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_pack_shared_memory_alloc.mlir
@@ -25,9 +25,11 @@
return
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
transform.iree.pack_shared_memory_alloc %0 : (!transform.any_op) -> ()
transform.iree.apply_cse %0 : !transform.any_op
-}
+ transform.yield
+ } // @__transform_main
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_promote_operands.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_promote_operands.mlir
index c36a1ff..757baf4 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_promote_operands.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_promote_operands.mlir
@@ -28,24 +28,26 @@
}
}
- transform.sequence failures(propagate) {
- ^bb1(%variant_op: !transform.any_op):
- %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op
- : (!transform.any_op) -> !transform.any_op
- %promoted_matmul, %alloc_0, %alloc_1 =
- transform.iree.promote_operands %matmul [0, 1]
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+ builtin.module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
+ %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op
+ : (!transform.any_op) -> !transform.any_op
+ %promoted_matmul, %alloc_0, %alloc_1 =
+ transform.iree.promote_operands %matmul [0, 1]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
- // Late canonicalizations to cleanup and pass the checks.
- %func_op = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func_op {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_licm %func_op : !transform.any_op
- transform.iree.apply_cse %func_op : !transform.any_op
- }
+ // Late canonicalizations to cleanup and pass the checks.
+ %func_op = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func_op {
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_licm %func_op : !transform.any_op
+ transform.iree.apply_cse %func_op : !transform.any_op
+ transform.yield
+ } // @__transform_main
+ } // module
}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir
index a162fca..b1c2065 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir
@@ -1,8 +1,10 @@
-// RUN: iree-opt %s --pass-pipeline="builtin.module(hal.executable(iree-transform-dialect-interpreter{transform-file-name=%p/transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir}))" \
+// RUN: iree-opt %s --pass-pipeline="builtin.module(hal.executable(iree-transform-dialect-interpreter))" \
+// RUN: --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir \
// RUN: --allow-unregistered-dialect | \
// RUN: FileCheck %s --check-prefix=WARP-EXECUTE
-// RUN: iree-opt %s --pass-pipeline="builtin.module(hal.executable(iree-transform-dialect-interpreter{transform-file-name=%p/transform_dialect_codegen_vector_distribution_spec.mlir}))" \
+// RUN: iree-opt %s --pass-pipeline="builtin.module(hal.executable(iree-transform-dialect-interpreter))" \
+// RUN: --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_vector_distribution_spec.mlir \
// RUN: --allow-unregistered-dialect | \
// RUN: FileCheck %s
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir
index 818a7fe..86b4321 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir
@@ -46,24 +46,24 @@
} {mapping = [#gpu.warp<x>]}
return
}
- module {
- transform.sequence failures(propagate) {
- ^bb0(%variant_op: !transform.any_op):
- %17 = transform.structured.match ops{["func.func"]} in %variant_op
- : (!transform.any_op) -> !transform.any_op
- transform.iree.map_nested_forall_to_gpu_threads %17
- workgroup_dims = [256, 1, 1] subgroup_size = 32 : (!transform.any_op) -> ()
+ builtin.module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
+ %17 = transform.structured.match ops{["func.func"]} in %variant_op
+ : (!transform.any_op) -> !transform.any_op
+ transform.iree.map_nested_forall_to_gpu_threads %17
+ workgroup_dims = [256, 1, 1] subgroup_size = 32 : (!transform.any_op) -> ()
- // Late canonicalizations to cleanup and pass the checks.
- // Needs to occur on the whole variant to perform cse on the workgroup_count region
- %func_op = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func_op {
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_licm %func_op : !transform.any_op
- transform.iree.apply_cse %func_op : !transform.any_op
- }
+ // Late canonicalizations to cleanup and pass the checks.
+ // Needs to occur on the whole variant to perform cse on the workgroup_count region
+ %func_op = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func_op {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_licm %func_op : !transform.any_op
+ transform.iree.apply_cse %func_op : !transform.any_op
+ transform.yield
+ } // @__transform_main
+ } // module
}
}
}
-}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_gpu_pipelining.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_gpu_pipelining.mlir
index b88f408..5bf464e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_gpu_pipelining.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_gpu_pipelining.mlir
@@ -52,13 +52,16 @@
return
}
}
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
- %for = transform.structured.match ops{["scf.for"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %1 = transform.cast %for : !transform.any_op to !transform.op<"scf.for">
- %2 = transform.iree.pipeline_shared_memory_copies %1 { depth = 4 } : (!transform.op<"scf.for">) -> !transform.op<"scf.for">
}
-}
+
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+ %for = transform.structured.match ops{["scf.for"]} in %root : (!transform.any_op) -> !transform.any_op
+ %1 = transform.cast %for : !transform.any_op to !transform.op<"scf.for">
+ %2 = transform.iree.pipeline_shared_memory_copies %1 { depth = 4 } : (!transform.op<"scf.for">) -> !transform.op<"scf.for">
+ transform.yield
+ } // @__transform_main
+} // module
// CHECK-LABEL: func.func @matmul_pipelining
// CHECK: nvgpu.device_async_copy
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_vector_to_mma.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_vector_to_mma.mlir
index 6180d5f..6bfe345 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_vector_to_mma.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_vector_to_mma.mlir
@@ -47,22 +47,25 @@
return
}
}
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
- %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func {
- transform.apply_patterns.iree.unroll_vectors_gpu_wmma_sync
- } : !transform.any_op
- transform.iree.vector.vector_to_mma_conversion %func { use_wmma } : (!transform.any_op) -> ()
+}
- // Apply canonicalization post-hoc to trigger DCE and pass the test
- // (i.e. all vector.contract are dead).
- // TODO: consider having the vector_to_mma_conversion do the DCE automatically.
- transform.apply_patterns to %func {
- transform.apply_patterns.canonicalization
- } : !transform.any_op
-}
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+ %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func {
+ transform.apply_patterns.iree.unroll_vectors_gpu_wmma_sync
+ } : !transform.any_op
+ transform.iree.vector.vector_to_mma_conversion %func { use_wmma } : (!transform.any_op) -> ()
+
+ // Apply canonicalization post-hoc to trigger DCE and pass the test
+ // (i.e. all vector.contract are dead).
+ // TODO: consider having the vector_to_mma_conversion do the DCE automatically.
+ transform.apply_patterns to %func {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
// -----
@@ -128,8 +131,9 @@
return
}
}
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
+
+builtin.module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
%func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
transform.apply_patterns to %func {
transform.apply_patterns.iree.unroll_vectors_gpu_wmma_sync
@@ -138,5 +142,7 @@
transform.apply_patterns to %func {
transform.apply_patterns.canonicalization
} : !transform.any_op
-}
+ transform.yield
+ } // @__transform_main
+} // module
}
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/SPIRV/test/BUILD.bazel
index f6c559f..d183c2e 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/BUILD.bazel
@@ -6,8 +6,8 @@
# Tests for common transforms.
-load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
package(
features = ["layering_check"],
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/set_transform_strategy.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/set_transform_strategy.mlir
index 300398f..1aa48d0 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/set_transform_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/set_transform_strategy.mlir
@@ -45,7 +45,7 @@
// CHECK-LABEL: func @matmul
-// CHECK: transform.sequence failures(propagate) {
+// CHECK: transform.named_sequence
/// The specific vector sizes are tested in the LLVMGPU tests and thus omitted
/// here. This is just to check that masked vectorization is used.
diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp
index fea61a8..4a122f0 100644
--- a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp
+++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp
@@ -9,6 +9,7 @@
#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h"
#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h"
#include "iree/compiler/Codegen/TransformStrategies/Common/AbstractReductionStrategy.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
@@ -37,8 +38,8 @@
using transform::MatchOp;
using transform::MemRefEraseDeadAllocAndStoresOp;
using transform::MergeHandlesOp;
+using transform::NamedSequenceOp;
using transform::PrintOp;
-using transform::SequenceOp;
using transform::SplitHandleOp;
using transform::SplitReductionOp;
using transform::TileUsingForallOp;
@@ -96,17 +97,25 @@
OpBuilder b(ctx);
b.setInsertionPointAfter(entryPoint);
auto topLevelTransformModule = b.create<ModuleOp>(loc);
+ topLevelTransformModule->setAttr(
+ transform::TransformDialect::kWithNamedSequenceAttrName, b.getUnitAttr());
Region &topLevelTransformRegion = topLevelTransformModule.getBodyRegion();
b.setInsertionPointToStart(&topLevelTransformRegion.front());
auto anyOpType = transform::AnyOpType::get(b.getContext());
- auto sequence = b.create<transform::SequenceOp>(
- loc, TypeRange{}, transform::FailurePropagationMode::Propagate, anyOpType,
- [&](OpBuilder &b, Location loc, Value variantH) {
+ auto sequence = b.create<transform::NamedSequenceOp>(
+ loc,
+ /*symName=*/
+ std::string(
+ transform::TransformDialect::kTransformEntryPointSymbolName.str()),
+ /*rootType=*/ anyOpType,
+ /*resultTypes=*/TypeRange{},
+ /*bodyBuilder=*/[&](OpBuilder &b, Location loc, Value variantH) {
ImplicitLocOpBuilder ib(loc, b);
buildStrategy(ib, variantH);
b.create<transform::YieldOp>(loc);
});
(void)sequence;
+
LDBG("transformation script:\n");
LDBG("verification: " << sequence.verify().succeeded() << "\n");
}
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel
index ef00e97..8992c5f 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel
@@ -4,8 +4,8 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
package(
features = ["layering_check"],
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dispatch_region_formation.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dispatch_region_formation.mlir
index 369a270..af63269 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dispatch_region_formation.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dispatch_region_formation.mlir
@@ -13,11 +13,13 @@
return %0 : tensor<?x?xf32>
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
// -----
@@ -41,13 +43,15 @@
return %0, %1 : tensor<?x?xf32>, tensor<?x?xf32>
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
- %1 = transform.structured.match ops{["test.dummy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.iree.clone_preceding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
+ %1 = transform.structured.match ops{["test.dummy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.iree.clone_preceding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
// -----
@@ -71,13 +75,15 @@
return %1, %0 : tensor<?x?xf32>, tensor<?x?xf32>
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
- %1 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.iree.move_preceding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
+ %1 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.iree.move_preceding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
// -----
@@ -95,12 +101,14 @@
return %matmul : tensor<5x5xf32>
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %region_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
- transform.iree.region_to_workgroups %region_op : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %region_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
+ transform.iree.region_to_workgroups %region_op : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
// -----
@@ -127,13 +135,15 @@
return %5 : tensor<?x?xf32>
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
- %1 = transform.structured.match attributes{"__tagged__"} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.iree.clone_preceding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
+ %1 = transform.structured.match attributes{"__tagged__"} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.iree.clone_preceding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
// -----
@@ -157,13 +167,15 @@
return %1, %0 : tensor<?x?xf32>, tensor<?x?xf32>
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
- %1 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.iree.move_succeeding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
+ %1 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.iree.move_succeeding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
// -----
@@ -190,13 +202,15 @@
return %5, %u : tensor<600x700xf32>, tensor<50x90xf32>
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["test.dummy_op"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %dispatch_op = transform.iree.wrap_in_dispatch_region %0 {generateWorkload=false} : (!transform.any_op) -> !transform.any_op
- %1 = transform.structured.match attributes{"__tagged__"} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.iree.move_succeeding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["test.dummy_op"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %dispatch_op = transform.iree.wrap_in_dispatch_region %0 {generateWorkload=false} : (!transform.any_op) -> !transform.any_op
+ %1 = transform.structured.match attributes{"__tagged__"} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.iree.move_succeeding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
// -----
@@ -217,13 +231,15 @@
return %1, %0 : tensor<?x?xf32>, tensor<?x?xf32>
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %dispatch_op = transform.iree.wrap_in_dispatch_region %0 {generateWorkload=false} : (!transform.any_op) -> !transform.any_op
- %1 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.iree.clone_succeeding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %dispatch_op = transform.iree.wrap_in_dispatch_region %0 {generateWorkload=false} : (!transform.any_op) -> !transform.any_op
+ %1 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.iree.clone_succeeding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
// -----
@@ -250,8 +266,10 @@
return %1 : tensor<4x?xf32>
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/BUILD.bazel b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/BUILD.bazel
index 64493c0..a8bb9f3 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/BUILD.bazel
@@ -4,8 +4,8 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
package(
features = ["layering_check"],
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/BUILD.bazel b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/BUILD.bazel
index d9b760a..fc34bff 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/BUILD.bazel
@@ -4,8 +4,8 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
package(
features = ["layering_check"],
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel
index 965b61f..1f2104a 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel
@@ -4,8 +4,8 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
package(
features = ["layering_check"],
diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/BUILD.bazel b/compiler/src/iree/compiler/GlobalOptimization/test/BUILD.bazel
index fa48db2..8d84f04 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/BUILD.bazel
@@ -4,8 +4,8 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
package(
features = ["layering_check"],
diff --git a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/BUILD.bazel b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/BUILD.bazel
index 09466b8..4c424a9 100644
--- a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/BUILD.bazel
@@ -4,8 +4,8 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
package(
features = ["layering_check"],
diff --git a/runtime/src/iree/base/tracing/BUILD.bazel b/runtime/src/iree/base/tracing/BUILD.bazel
index 680b97a..6870606 100644
--- a/runtime/src/iree/base/tracing/BUILD.bazel
+++ b/runtime/src/iree/base/tracing/BUILD.bazel
@@ -4,8 +4,8 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-load("//build_tools/bazel:build_defs.oss.bzl", "iree_runtime_cc_library")
load("@bazel_skylib//rules:common_settings.bzl", "string_flag")
+load("//build_tools/bazel:build_defs.oss.bzl", "iree_runtime_cc_library")
package(
default_visibility = ["//visibility:public"],
diff --git a/samples/transform_dialect/example_module.mlir b/samples/transform_dialect/example_module.mlir
index 2b9275a..c5eab46 100644
--- a/samples/transform_dialect/example_module.mlir
+++ b/samples/transform_dialect/example_module.mlir
@@ -107,13 +107,13 @@
}
/// We test first with threading off so that the printers are legible.
-// RUN: iree-compile %s --iree-hal-target-backends=vulkan \
-// RUN: --iree-codegen-use-transform-dialect-strategy=@transform_main \
-// RUN: --iree-codegen-transform-dialect-library=%p/transform_library.mlir \
-// RUN: --compile-from=executable-sources \
-// RUN: --compile-to=executable-targets \
-// RUN: --mlir-disable-threading | \
-// RUN: FileCheck %s --check-prefixes=CODEGEN-PRINTER
+// R-UN: iree-compile %s --iree-hal-target-backends=vulkan \
+// R-UN: --iree-codegen-use-transform-dialect-strategy=transform_main \
+// R-UN: --iree-codegen-transform-dialect-library=%p/transform_library.mlir \
+// R-UN: --compile-from=executable-sources \
+// R-UN: --compile-to=executable-targets \
+// R-UN: --mlir-disable-threading | \
+// R-UN: FileCheck %s --check-prefixes=CODEGEN-PRINTER
// CODEGEN-PRINTER: IR printer: Setting matmul strategy to default top-level
// CODEGEN-PRINTER: translation_info = #iree_codegen.translation_info<TransformDialectCodegen codegen_spec = @transform_main
diff --git a/tests/transform_dialect/cpu/attention.mlir b/tests/transform_dialect/cpu/attention.mlir
index 00591b1..9dd587f 100644
--- a/tests/transform_dialect/cpu/attention.mlir
+++ b/tests/transform_dialect/cpu/attention.mlir
@@ -9,7 +9,8 @@
}
// RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/attention_codegen_spec.mlir | \
+// RUN: --iree-codegen-transform-dialect-library=%p/attention_codegen_spec.mlir \
+// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \
// RUN: iree-run-module --module=- --function=attention | \
// RUN: FileCheck %s --check-prefixes=EXEC
diff --git a/tests/transform_dialect/cpu/attention_codegen_spec.mlir b/tests/transform_dialect/cpu/attention_codegen_spec.mlir
index b1e4315..f31f50e 100644
--- a/tests/transform_dialect/cpu/attention_codegen_spec.mlir
+++ b/tests/transform_dialect/cpu/attention_codegen_spec.mlir
@@ -1,5 +1,8 @@
-transform.sequence failures(propagate) {
- ^bb0(%variant_op: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+
+ // Codegen.
+ transform.named_sequence @codegen(
+ %variant_op: !transform.any_op {transform.consumed}) {
// Get attention op
// ==========================================
@@ -18,7 +21,7 @@
// ==========================================
%attention2 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
%acc_fill, %max_fill, %sum_fill, %inner_loop, %fill_op, %first_matmul, %reduce_max, %partial_softmax, %update, %reduce_sum,
- %reciprocal_sum, %softmax, %scale_acc, %second_matmul = tile_and_decompose_attention %attention2 :
+ %reciprocal_sum, %softmax, %scale_acc, %second_matmul = transform.tile_and_decompose_attention %attention2 :
(!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op,!transform.any_op, !transform.any_op, !transform.any_op)
// Vectorize function
@@ -58,4 +61,22 @@
} : !transform.any_op
transform.iree.apply_cse %func_8 : !transform.any_op
transform.memref.erase_dead_alloc_and_stores %func_8 : (!transform.any_op) -> ()
-}
+ transform.yield
+ } // codegen
+
+ // Find `hal.executable.variant`.
+ transform.named_sequence @match_variant_for_codegen(%root: !transform.any_op {transform.readonly})
+ -> !transform.any_op {
+ transform.match.operation_name %root ["hal.executable.variant"] : !transform.any_op
+ transform.yield %root : !transform.any_op
+ }
+
+ // Transform entry-point
+ transform.named_sequence @__transform_main(%root: !transform.any_op {transform.consumed}) {
+ transform.foreach_match in %root
+ @match_variant_for_codegen -> @codegen
+ : (!transform.any_op) -> (!transform.any_op)
+ transform.yield
+ }
+} // module
+
diff --git a/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir b/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir
index c041d46..dbafce8 100644
--- a/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir
+++ b/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir
@@ -9,13 +9,6 @@
// RUN: --iree-hal-configuration-pipeline | \
// RUN: FileCheck %s
-// Check that compilation runs all the way to the end.
-// TODO: this currently fails with:
-// 'memref.alloca' op all stack allocations need to be hoisted to the entry block of the function
-//
-// R-UN: iree-opt %s --iree-transform-dialect-interpreter --transform-dialect-drop-schedule | \
-// R-UN: iree-compile --iree-hal-target-backends=llvm-cpu
-
!a_tensor_t = tensor<1234x567xf32>
!b_tensor_t = tensor<567x890xf32>
!c_tensor_t = tensor<1234x890xf32>
@@ -57,14 +50,15 @@
// CHECK-LABEL: func.func @matmul_dispatch_4
// CHECK: tensor.unpack
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
+ %matmul = transform.structured.match interface{LinalgOp} in %module_op
+ : (!transform.any_op) -> (!transform.any_op)
-transform.sequence failures(propagate) {
-^bb1(%module_op: !transform.any_op):
- %matmul = transform.structured.match interface{LinalgOp} in %module_op
- : (!transform.any_op) -> (!transform.any_op)
-
- transform.structured.pack_greedily %matmul
- matmul_packed_sizes = [8, 16, 32]
- matmul_inner_dims_order = [0, 1, 2]
- : (!transform.any_op) -> !transform.op<"linalg.generic">
-}
+ transform.structured.pack_greedily %matmul
+ matmul_packed_sizes = [8, 16, 32]
+ matmul_inner_dims_order = [0, 1, 2]
+ : (!transform.any_op) -> !transform.op<"linalg.generic">
+ transform.yield
+ }
+} // module
diff --git a/tests/transform_dialect/cpu/contraction-packing.mlir b/tests/transform_dialect/cpu/contraction-packing.mlir
index 2ab912f..d103303 100644
--- a/tests/transform_dialect/cpu/contraction-packing.mlir
+++ b/tests/transform_dialect/cpu/contraction-packing.mlir
@@ -136,16 +136,19 @@
return %0 : !ct_tensor_t
}
-transform.sequence failures(propagate) {
-^bb1(%module_op: !transform.any_op):
- %matmul = transform.structured.match interface{LinalgOp} in %module_op
- : (!transform.any_op) -> (!transform.any_op)
-
- // Generalized packing rewrite extracts a gemm from any linalg op that contains
- // one. This acts as a powerful normalization step: after this point, we have a
- // gemm (i.e. 3-D contraction with (m,n,k)=(8,16,32) ) on the 3 most minor
- // dimensions.
- transform.structured.pack_greedily %matmul
- matmul_packed_sizes = [8, 16, 32] matmul_inner_dims_order = [0, 1, 2]
- : (!transform.any_op) -> !transform.op<"linalg.generic">
-}
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
+ %matmul = transform.structured.match interface{LinalgOp} in %module_op
+ : (!transform.any_op) -> (!transform.any_op)
+
+ // Generalized packing rewrite extracts a gemm from any linalg op that contains
+ // one. This acts as a powerful normalization step: after this point, we have a
+ // gemm (i.e. 3-D contraction with (m,n,k)=(8,16,32) ) on the 3 most minor
+ // dimensions.
+ transform.structured.pack_greedily %matmul
+ matmul_packed_sizes = [8, 16, 32] matmul_inner_dims_order = [0, 1, 2]
+ : (!transform.any_op) -> !transform.op<"linalg.generic">
+ transform.yield
+ }
+} // module
+
diff --git a/tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir b/tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir
index 8e10591..18c675a 100644
--- a/tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir
+++ b/tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir
@@ -4,13 +4,6 @@
// added to IREE in https://github.com/openxla/iree/pull/14373, as a workaround
// for other patterns being sensitive to these exact transforms.
-transform.sequence failures(propagate) {
-^bb1(%func_op: !transform.op<"func.func">):
- transform.apply_patterns to %func_op {
- transform.apply_patterns.iree.fold_tensor_slice_into_transfer
- } : !transform.op<"func.func">
-}
-
// CHECK-LABEL: func @transfer_read_of_extract_slice(
// CHECK-SAME: %[[t:.*]]: tensor<?x?xf32>, %[[s1:.*]]: index, %[[s2:.*]]: index
// CHECK-DAG: %[[c4:.*]] = arith.constant 4 : index
@@ -106,3 +99,12 @@
%1 = tensor.insert_slice %0 into %t1[4, 3, %s] [1, 5, 6] [1, 1, 1] : tensor<5x6xf32> into tensor<?x?x12xf32>
return %1 : tensor<?x?x12xf32>
}
+
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.apply_patterns to %func_op {
+ transform.apply_patterns.iree.fold_tensor_slice_into_transfer
+ } : !transform.op<"func.func">
+ transform.yield
+ }
+} // module
diff --git a/tests/transform_dialect/cpu/matmul.mlir b/tests/transform_dialect/cpu/matmul.mlir
index 8bc4c7f..63d059e 100644
--- a/tests/transform_dialect/cpu/matmul.mlir
+++ b/tests/transform_dialect/cpu/matmul.mlir
@@ -16,7 +16,8 @@
// RUN: --iree-stream-transformation-pipeline \
// RUN: --iree-hal-configuration-pipeline | \
// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmcpu-lower-executable-target)))' \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/matmul_codegen_default_spec.mlir | \
+// RUN: --iree-codegen-transform-dialect-library=%p/matmul_codegen_default_spec.mlir \
+// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \
// RUN: FileCheck %s --check-prefixes=CODEGEN-DEFAULT
// CODEGEN-DEFAULT: hal.executable.export public @matmul_static_dispatch_0_matmul_3x3x5
diff --git a/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir b/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir
index b84a4ba..df5231d 100644
--- a/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir
+++ b/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir
@@ -1,24 +1,27 @@
// RUN: iree-opt %s
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
- %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @codegen(
+ %variant_op: !transform.any_op {transform.consumed}) {
+ %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- // Step 1. Tile to forall with tile_sizes [2].
- // ===================================================
- %tiled_generic, %forall =
- transform.structured.tile_using_forall %matmul tile_sizes [2]
- ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall
- : (!transform.any_op) -> ()
+ // Step 1. Tile to forall with tile_sizes [2].
+ // ===================================================
+ %tiled_generic, %forall =
+ transform.structured.tile_using_forall %matmul tile_sizes [2]
+ ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall
+ : (!transform.any_op) -> ()
- // Step 2. Bufferize and drop HAL decriptor from memref ops.
- // =========================================================
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> !transform.any_op
+ // Step 2. Bufferize and drop HAL descriptor from memref ops.
+ // =========================================================
+ transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+ %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> !transform.any_op
- // Step 3. Post-bufferization mapping workgroup.
- // =========================================================
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
-}
+ // Step 3. Post-bufferization mapping workgroup.
+ // =========================================================
+ %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
+ transform.yield
+ }
+} // module
diff --git a/tests/transform_dialect/cpu/matmul_library_call.mlir b/tests/transform_dialect/cpu/matmul_library_call.mlir
index 5dd24db..211a598 100644
--- a/tests/transform_dialect/cpu/matmul_library_call.mlir
+++ b/tests/transform_dialect/cpu/matmul_library_call.mlir
@@ -13,7 +13,7 @@
}
// RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \
-// RUN: --iree-codegen-use-transform-dialect-strategy=@custom_matmul \
+// RUN: --iree-codegen-use-transform-dialect-strategy=custom_matmul \
// RUN: --iree-codegen-transform-dialect-library=%p/transform_library.mlir \
// RUN: --compile-to=executable-targets | \
// RUN: FileCheck %s --check-prefixes=CODEGEN-DEFAULT
@@ -25,7 +25,7 @@
// RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \
// RUN: --iree-codegen-transform-dialect-library=%p/transform_library.mlir \
-// RUN: --iree-codegen-use-transform-dialect-strategy=@custom_matmul | \
+// RUN: --iree-codegen-use-transform-dialect-strategy=custom_matmul | \
// RUN: iree-run-module --module=- --function=matmul_static \
// RUN: --input="3x5xf32=1" \
// RUN: --input="5x3xf32=2" \
diff --git a/tests/transform_dialect/cuda/BUILD.bazel b/tests/transform_dialect/cuda/BUILD.bazel
index 8615fab..a7903fa 100644
--- a/tests/transform_dialect/cuda/BUILD.bazel
+++ b/tests/transform_dialect/cuda/BUILD.bazel
@@ -35,7 +35,6 @@
"softmax_v2.mlir",
# First few ops of softmax only, acts as a proxy example.
"softmax_partial.mlir",
- "vecadd2d.mlir",
],
cfg = "//tests:lit.cfg.py",
# transform dialect spec files are MLIR files that specify a transformation,
@@ -57,8 +56,6 @@
"softmax_dispatch_spec.mlir",
# First few ops of softmax only, acts as a proxy example.
"softmax_partial_codegen_spec.mlir",
- "vecadd2d_codegen_spec.mlir",
- "vecadd2d_codegen_spec_partial_tile.mlir",
],
tags = [
# CUDA cuInit fails with sanitizer on.
diff --git a/tests/transform_dialect/cuda/CMakeLists.txt b/tests/transform_dialect/cuda/CMakeLists.txt
index b2d4d1d..ab23d81 100644
--- a/tests/transform_dialect/cuda/CMakeLists.txt
+++ b/tests/transform_dialect/cuda/CMakeLists.txt
@@ -26,7 +26,6 @@
"softmax.mlir"
"softmax_partial.mlir"
"softmax_v2.mlir"
- "vecadd2d.mlir"
TOOLS
FileCheck
iree-compile
@@ -43,8 +42,6 @@
softmax_dispatch_spec.mlir
softmax_partial_codegen_spec.mlir
softmax_v2_codegen_spec.mlir
- vecadd2d_codegen_spec.mlir
- vecadd2d_codegen_spec_partial_tile.mlir
LABELS
"noasan"
"nomsan"
diff --git a/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir b/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir
index 93b143c..cb53367 100644
--- a/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir
+++ b/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir
@@ -15,7 +15,8 @@
// RUN: --iree-hal-cuda-llvm-target-arch=sm_80 \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
// RUN: --iree-flow-dispatch-use-transform-dialect=%p/double_mma_layout_analysis_dispatch_spec.mlir \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/double_mma_layout_analysis_codegen_spec.mlir | \
+// RUN: --iree-codegen-transform-dialect-library=%p/double_mma_layout_analysis_codegen_spec.mlir \
+// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \
// RUN: iree-run-module --module=- --function=double_matmul --device=cuda \
// RUN: --input="16x16xf16=[[0.0999755859375,0.2249755859375,0.07501220703125,0.0,0.07501220703125,0.2249755859375,0.175048828125,0.07501220703125,0.175048828125,0.07501220703125,0.024993896484375,0.1500244140625,0.1500244140625,0.2249755859375,0.199951171875,0.1500244140625],[0.1500244140625,0.199951171875,0.0999755859375,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.0999755859375,0.0999755859375,0.024993896484375,0.2249755859375,0.2249755859375,0.2249755859375,0.0,0.024993896484375,0.04998779296875],[0.07501220703125,0.0,0.125,0.125,0.04998779296875,0.2249755859375,0.024993896484375,0.199951171875,0.199951171875,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.175048828125,0.07501220703125,0.125],[0.04998779296875,0.024993896484375,0.0,0.2249755859375,0.07501220703125,0.024993896484375,0.024993896484375,0.0,0.07501220703125,0.1500244140625,0.1500244140625,0.175048828125,0.2249755859375,0.1500244140625,0.07501220703125,0.0999755859375],[0.125,0.0,0.199951171875,0.04998779296875,0.199951171875,0.04998779296875,0.175048828125,0.125,0.0,0.0,0.199951171875,0.024993896484375,0.2249755859375,0.1500244140625,0.024993896484375,0.0],[0.04998779296875,0.2249755859375,0.0999755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.199951171875,0.125,0.07501220703125,0.04998779296875,0.199951171875,0.125,0.1500244140625],[0.1500244140625,0.125,0.175048828125,0.04998779296875,0.125,0.1500244140625,0.1500244140625,0.125,0.0999755859375,0.0,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.125,0.0999755859375],[0.0999755859375,0.199951171875,0.0999755859375,0.0999755859375,0.2249755859375,0.0,0.175048828125,0.0999755859375,0.125,0.07501220703125,0.07501220703125,0.175048828125,0.07501220703125,0.0,0.2249755859375,0.2249755859375],[0.07501220703125,0.024993896484375,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.0999755859375,0.024993896484375,0.0,0.0999
755859375,0.0,0.0999755859375,0.2249755859375,0.175048828125,0.0,0.0],[0.024993896484375,0.0999755859375,0.2249755859375,0.2249755859375,0.125,0.2249755859375,0.04998779296875,0.04998779296875,0.04998779296875,0.024993896484375,0.0999755859375,0.2249755859375,0.024993896484375,0.024993896484375,0.0,0.07501220703125],[0.0,0.1500244140625,0.175048828125,0.1500244140625,0.2249755859375,0.024993896484375,0.1500244140625,0.0999755859375,0.024993896484375,0.0,0.125,0.04998779296875,0.125,0.199951171875,0.024993896484375,0.199951171875],[0.024993896484375,0.04998779296875,0.199951171875,0.0,0.07501220703125,0.199951171875,0.2249755859375,0.04998779296875,0.175048828125,0.0,0.199951171875,0.199951171875,0.1500244140625,0.199951171875,0.125,0.199951171875],[0.1500244140625,0.125,0.04998779296875,0.0999755859375,0.04998779296875,0.175048828125,0.04998779296875,0.0999755859375,0.2249755859375,0.199951171875,0.125,0.1500244140625,0.0999755859375,0.07501220703125,0.07501220703125,0.0999755859375],[0.0,0.04998779296875,0.125,0.024993896484375,0.04998779296875,0.199951171875,0.04998779296875,0.0999755859375,0.199951171875,0.07501220703125,0.1500244140625,0.125,0.199951171875,0.199951171875,0.0,0.125],[0.024993896484375,0.07501220703125,0.0,0.199951171875,0.024993896484375,0.024993896484375,0.024993896484375,0.175048828125,0.04998779296875,0.04998779296875,0.04998779296875,0.07501220703125,0.07501220703125,0.1500244140625,0.175048828125,0.199951171875],[0.0,0.125,0.0,0.07501220703125,0.125,0.125,0.07501220703125,0.1500244140625,0.04998779296875,0.04998779296875,0.125,0.125,0.2249755859375,0.0999755859375,0.07501220703125,0.07501220703125]]" \
// RUN: --input="16x16xf16=[[0.175048828125,0.07501220703125,0.199951171875,0.0,0.175048828125,0.125,0.199951171875,0.04998779296875,0.0999755859375,0.175048828125,0.07501220703125,0.04998779296875,0.125,0.125,0.07501220703125,0.2249755859375],[0.024993896484375,0.199951171875,0.0,0.1500244140625,0.175048828125,0.0999755859375,0.175048828125,0.1500244140625,0.2249755859375,0.07501220703125,0.199951171875,0.0999755859375,0.0999755859375,0.2249755859375,0.0999755859375,0.0999755859375],[0.2249755859375,0.2249755859375,0.125,0.175048828125,0.0,0.07501220703125,0.04998779296875,0.0,0.199951171875,0.1500244140625,0.024993896484375,0.2249755859375,0.024993896484375,0.1500244140625,0.2249755859375,0.199951171875],[0.1500244140625,0.125,0.024993896484375,0.07501220703125,0.125,0.125,0.07501220703125,0.1500244140625,0.04998779296875,0.175048828125,0.125,0.175048828125,0.175048828125,0.07501220703125,0.024993896484375,0.125],[0.2249755859375,0.125,0.2249755859375,0.1500244140625,0.0,0.0,0.1500244140625,0.125,0.024993896484375,0.125,0.0,0.024993896484375,0.175048828125,0.175048828125,0.024993896484375,0.125],[0.2249755859375,0.024993896484375,0.04998779296875,0.0,0.0,0.1500244140625,0.07501220703125,0.2249755859375,0.1500244140625,0.024993896484375,0.0,0.0999755859375,0.125,0.1500244140625,0.2249755859375,0.0],[0.125,0.0999755859375,0.0,0.0999755859375,0.199951171875,0.125,0.175048828125,0.175048828125,0.1500244140625,0.2249755859375,0.04998779296875,0.125,0.1500244140625,0.0,0.0,0.0999755859375],[0.125,0.07501220703125,0.175048828125,0.1500244140625,0.175048828125,0.0,0.04998779296875,0.125,0.125,0.024993896484375,0.0999755859375,0.175048828125,0.024993896484375,0.0,0.024993896484375,0.0],[0.2249755859375,0.024993896484375,0.0999755859375,0.04998779296875,0.125,0.07501220703125,0.0999755859375,0.024993896484375,0.125,0.125,0.125,0.024993896484375,0.125,0.04998779296875,0.0999755859375,0.07501220703125],[0.0999755859375,0.175048828125,0.199951171875,0.0999755859375,0.175048828
125,0.07501220703125,0.024993896484375,0.125,0.07501220703125,0.0,0.125,0.07501220703125,0.07501220703125,0.0,0.199951171875,0.175048828125],[0.07501220703125,0.0999755859375,0.175048828125,0.07501220703125,0.125,0.1500244140625,0.0,0.0999755859375,0.2249755859375,0.199951171875,0.04998779296875,0.0,0.0,0.1500244140625,0.199951171875,0.2249755859375],[0.024993896484375,0.2249755859375,0.04998779296875,0.1500244140625,0.2249755859375,0.2249755859375,0.175048828125,0.0999755859375,0.024993896484375,0.199951171875,0.125,0.199951171875,0.175048828125,0.2249755859375,0.175048828125,0.0999755859375],[0.125,0.0999755859375,0.04998779296875,0.125,0.199951171875,0.07501220703125,0.199951171875,0.0,0.024993896484375,0.04998779296875,0.0,0.04998779296875,0.04998779296875,0.199951171875,0.1500244140625,0.0999755859375],[0.199951171875,0.0,0.125,0.04998779296875,0.07501220703125,0.175048828125,0.0999755859375,0.175048828125,0.024993896484375,0.07501220703125,0.0,0.1500244140625,0.07501220703125,0.024993896484375,0.07501220703125,0.175048828125],[0.1500244140625,0.125,0.0999755859375,0.175048828125,0.04998779296875,0.0,0.04998779296875,0.1500244140625,0.024993896484375,0.125,0.125,0.175048828125,0.125,0.0999755859375,0.175048828125,0.1500244140625],[0.07501220703125,0.199951171875,0.024993896484375,0.0999755859375,0.175048828125,0.07501220703125,0.1500244140625,0.04998779296875,0.0,0.024993896484375,0.07501220703125,0.07501220703125,0.1500244140625,0.04998779296875,0.2249755859375,0.1500244140625]]" \
diff --git a/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir
index 11a6372..9982106 100644
--- a/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir
@@ -1,69 +1,73 @@
// RUN: iree-opt %s
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @codegen(
+ %variant_op: !transform.any_op {transform.consumed}) {
- // Step 1. Find the fill and matmul ops
- // ===========================================================================
- %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %fill0, %fill1 = transform.split_handle %fill : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- %matmul0, %matmul1 = transform.split_handle %matmul : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ // Step 1. Find the fill and matmul ops
+ // ===========================================================================
+ %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %fill0, %fill1 = transform.split_handle %fill : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %matmul0, %matmul1 = transform.split_handle %matmul : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- // Step 2. Tile the matmul and fuse the fill
- // ===========================================================================
- %grid_reduction, %forall_grid =
- transform.structured.tile_using_forall %matmul1 tile_sizes [16] ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
+ // Step 2. Tile the matmul and fuse the fill
+ // ===========================================================================
+ %grid_reduction, %forall_grid =
+ transform.structured.tile_using_forall %matmul1 tile_sizes [16] ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
- transform.structured.fuse_into_containing_op %fill1 into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.structured.fuse_into_containing_op %matmul0 into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.structured.fuse_into_containing_op %fill0 into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.structured.fuse_into_containing_op %fill1 into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.structured.fuse_into_containing_op %matmul0 into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.structured.fuse_into_containing_op %fill0 into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- // Step 3. Vectorize
- // ===========================================================================
- %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
+ // Step 3. Vectorize
+ // ===========================================================================
+ %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func {
+ transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+ transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+ transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+ } : !transform.any_op
+ %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
- // Step 4. Bufferize
- // ===========================================================================
- transform.apply_patterns to %func_3 {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- } : !transform.any_op
- transform.apply_patterns to %func_3 {
- transform.apply_patterns.tensor.reassociative_reshape_folding
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_cse %func_3 : !transform.any_op
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- transform.apply_patterns to %func_3 {
- transform.apply_patterns.linalg.erase_unnecessary_inputs
- } : !transform.any_op
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ // Step 4. Bufferize
+ // ===========================================================================
+ transform.apply_patterns to %func_3 {
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ } : !transform.any_op
+ transform.apply_patterns to %func_3 {
+ transform.apply_patterns.tensor.reassociative_reshape_folding
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_cse %func_3 : !transform.any_op
+ transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+ transform.apply_patterns to %func_3 {
+ transform.apply_patterns.linalg.erase_unnecessary_inputs
+ } : !transform.any_op
+ %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
+ %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- // Step 5. Pre-process the contract and transfer ops to put it in the right form.
- // ===========================================================================
- %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func_2 {
- transform.apply_patterns.iree.prepare_vector_to_mma
- } : !transform.any_op
+ // Step 5. Pre-process the contract and transfer ops to put it in the right form.
+ // ===========================================================================
+ %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func_2 {
+ transform.apply_patterns.iree.prepare_vector_to_mma
+ } : !transform.any_op
- // Step 6. Post-bufferization vector distribution
- // ===========================================================================
- %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
- transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
+ // Step 6. Post-bufferization vector distribution
+ // ===========================================================================
+ %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
+ transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
- // Step 7. Do layout analysis and lower to mma
- // ===========================================================================
- %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
-}
+ // Step 7. Do layout analysis and lower to mma
+ // ===========================================================================
+ %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
+
+ transform.yield
+ }
+} // module
diff --git a/tests/transform_dialect/cuda/mma.mlir b/tests/transform_dialect/cuda/mma.mlir
index 92d5cf8..2093e1c 100644
--- a/tests/transform_dialect/cuda/mma.mlir
+++ b/tests/transform_dialect/cuda/mma.mlir
@@ -27,21 +27,25 @@
return
}
-transform.sequence failures(propagate) {
-^bb1(%module: !transform.any_op):
- %func = transform.structured.match ops{["func.func"]} in %module
- : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func {
- transform.apply_patterns.iree.unroll_vectors_gpu_wmma_sync
- } : !transform.any_op
- transform.iree.vector.vector_to_mma_conversion %func { use_wmma } : (!transform.any_op) -> ()
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(
+ %module: !transform.any_op {transform.readonly}) {
+ %func = transform.structured.match ops{["func.func"]} in %module
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func {
+ transform.apply_patterns.iree.unroll_vectors_gpu_wmma_sync
+ } : !transform.any_op
+ transform.iree.vector.vector_to_mma_conversion %func { use_wmma } : (!transform.any_op) -> ()
- // Apply canonicalization post-hoc to trigger DCE and pass the test
- // (i.e. all vector.contract are dead).
- // TODO: consider having the vector_to_mma_conversion do the DCE automatically.
- transform.apply_patterns to %func {
- transform.apply_patterns.canonicalization
- } : !transform.any_op
+ // Apply canonicalization post-hoc to trigger DCE and pass the test
+ // (i.e. all vector.contract are dead).
+ // TODO: consider having the vector_to_mma_conversion do the DCE automatically.
+ transform.apply_patterns to %func {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+
+ transform.yield
+ }
}
// -----
@@ -71,20 +75,23 @@
return
}
-transform.sequence failures(propagate) {
-^bb1(%module: !transform.any_op):
- %func = transform.structured.match ops{["func.func"]} in %module
- : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func {
- transform.apply_patterns.iree.unroll_vectors_gpu_mma_sync
- } : !transform.any_op
- transform.iree.vector.vector_to_mma_conversion %func { use_mma_sync } : (!transform.any_op) -> ()
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(
+ %module: !transform.any_op {transform.readonly}) {
+ %func = transform.structured.match ops{["func.func"]} in %module
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func {
+ transform.apply_patterns.iree.unroll_vectors_gpu_mma_sync
+ } : !transform.any_op
+ transform.iree.vector.vector_to_mma_conversion %func { use_mma_sync } : (!transform.any_op) -> ()
- // Apply canonicalization post-hoc to trigger DCE and pass the test
- // (i.e. all vector.contract are dead).
- // TODO: consider having the vector_to_mma_conversion do the DCE automatically.
- transform.apply_patterns to %func {
- transform.apply_patterns.canonicalization
- } : !transform.any_op
+ // Apply canonicalization post-hoc to trigger DCE and pass the test
+ // (i.e. all vector.contract are dead).
+ // TODO: consider having the vector_to_mma_conversion do the DCE automatically.
+ transform.apply_patterns to %func {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+
+ transform.yield
+ }
}
-
diff --git a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis.mlir b/tests/transform_dialect/cuda/mma_elemwise_layout_analysis.mlir
index e1cbe68..aaf5801 100644
--- a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis.mlir
+++ b/tests/transform_dialect/cuda/mma_elemwise_layout_analysis.mlir
@@ -18,7 +18,7 @@
// RUN: iree-compile %s --iree-hal-target-backends=cuda \
// RUN: --iree-hal-cuda-llvm-target-arch=sm_80 \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/mma_elemwise_layout_analysis_codegen_spec.mlir | \
+// RUN: --iree-codegen-transform-dialect-library=%p/mma_elemwise_layout_analysis_codegen_spec.mlir | \
// RUN: iree-run-module --module=- --function=matmul --device=cuda \
// RUN: --input="16x16xf16=[[0.0999755859375,0.2249755859375,0.07501220703125,0.0,0.07501220703125,0.2249755859375,0.175048828125,0.07501220703125,0.175048828125,0.07501220703125,0.024993896484375,0.1500244140625,0.1500244140625,0.2249755859375,0.199951171875,0.1500244140625],[0.1500244140625,0.199951171875,0.0999755859375,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.0999755859375,0.0999755859375,0.024993896484375,0.2249755859375,0.2249755859375,0.2249755859375,0.0,0.024993896484375,0.04998779296875],[0.07501220703125,0.0,0.125,0.125,0.04998779296875,0.2249755859375,0.024993896484375,0.199951171875,0.199951171875,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.175048828125,0.07501220703125,0.125],[0.04998779296875,0.024993896484375,0.0,0.2249755859375,0.07501220703125,0.024993896484375,0.024993896484375,0.0,0.07501220703125,0.1500244140625,0.1500244140625,0.175048828125,0.2249755859375,0.1500244140625,0.07501220703125,0.0999755859375],[0.125,0.0,0.199951171875,0.04998779296875,0.199951171875,0.04998779296875,0.175048828125,0.125,0.0,0.0,0.199951171875,0.024993896484375,0.2249755859375,0.1500244140625,0.024993896484375,0.0],[0.04998779296875,0.2249755859375,0.0999755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.199951171875,0.125,0.07501220703125,0.04998779296875,0.199951171875,0.125,0.1500244140625],[0.1500244140625,0.125,0.175048828125,0.04998779296875,0.125,0.1500244140625,0.1500244140625,0.125,0.0999755859375,0.0,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.125,0.0999755859375],[0.0999755859375,0.199951171875,0.0999755859375,0.0999755859375,0.2249755859375,0.0,0.175048828125,0.0999755859375,0.125,0.07501220703125,0.07501220703125,0.175048828125,0.07501220703125,0.0,0.2249755859375,0.2249755859375],[0.07501220703125,0.024993896484375,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.0999755859375,0.024993896484375,0.0,0.0999755859375,0.0,0.0999755859375,0.2249755859375,0.175048828125,0.0,0.0],[0.024993896484375,0.0999755859375,0.2249755859375,0.2249755859375,0.125,0.2249755859375,0.04998779296875,0.04998779296875,0.04998779296875,0.024993896484375,0.0999755859375,0.2249755859375,0.024993896484375,0.024993896484375,0.0,0.07501220703125],[0.0,0.1500244140625,0.175048828125,0.1500244140625,0.2249755859375,0.024993896484375,0.1500244140625,0.0999755859375,0.024993896484375,0.0,0.125,0.04998779296875,0.125,0.199951171875,0.024993896484375,0.199951171875],[0.024993896484375,0.04998779296875,0.199951171875,0.0,0.07501220703125,0.199951171875,0.2249755859375,0.04998779296875,0.175048828125,0.0,0.199951171875,0.199951171875,0.1500244140625,0.199951171875,0.125,0.199951171875],[0.1500244140625,0.125,0.04998779296875,0.0999755859375,0.04998779296875,0.175048828125,0.04998779296875,0.0999755859375,0.2249755859375,0.199951171875,0.125,0.1500244140625,0.0999755859375,0.07501220703125,0.07501220703125,0.0999755859375],[0.0,0.04998779296875,0.125,0.024993896484375,0.04998779296875,0.199951171875,0.04998779296875,0.0999755859375,0.199951171875,0.07501220703125,0.1500244140625,0.125,0.199951171875,0.199951171875,0.0,0.125],[0.024993896484375,0.07501220703125,0.0,0.199951171875,0.024993896484375,0.024993896484375,0.024993896484375,0.175048828125,0.04998779296875,0.04998779296875,0.04998779296875,0.07501220703125,0.07501220703125,0.1500244140625,0.175048828125,0.199951171875],[0.0,0.125,0.0,0.07501220703125,0.125,0.125,0.07501220703125,0.1500244140625,0.04998779296875,0.04998779296875,0.125,0.125,0.2249755859375,0.0999755859375,0.07501220703125,0.07501220703125]]" \
// RUN: --input="8x16xf16=[[0.175049 0.0999756 0.0249939 0.224976 0.224976 0.199951 0.150024 0.0499878 0.224976 0.0249939 0.224976 0.150024 0.125 0.150024 0.125 0.125][0.0750122 0.175049 0.199951 0.0750122 0.224976 0.150024 0.125 0.175049 0.125 0.125 0.0249939 0.0249939 0.0999756 0.224976 0.0750122 0.0249939][0.199951 0.0750122 0 0.199951 0.125 0.0249939 0.0249939 0.125 0.224976 0 0.0499878 0 0 0.0499878 0.175049 0.0999756][0 0.0499878 0.150024 0.0999756 0.175049 0.224976 0.0750122 0.175049 0.150024 0.0249939 0 0.0999756 0.0999756 0.125 0.150024 0.175049][0.175049 0.125 0.175049 0.0999756 0 0.0249939 0.125 0.175049 0 0.175049 0 0.125 0.199951 0.150024 0.175049 0.0249939][0.125 0.125 0.0999756 0.224976 0.0750122 0.150024 0.125 0.0750122 0 0.175049 0.150024 0.150024 0.125 0 0 0][0.199951 0.0750122 0.175049 0.0999756 0.0499878 0.224976 0.0750122 0.0249939 0.150024 0.0249939 0.0750122 0.224976 0.175049 0 0.0499878 0.0249939][0.0499878 0.224976 0.150024 0.0999756 0 0.199951 0.150024 0.125 0.125 0.125 0.224976 0 0.175049 0.0999756 0.125 0]]" \
diff --git a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir
index 3300cd0..e2d5780 100644
--- a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir
@@ -1,61 +1,64 @@
// RUN: iree-opt %s
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(
+ %variant_op: !transform.any_op {transform.consumed}) {
+ // Step 1. Find the fill, matmul and generic ops
+ // ===========================================================================
+ %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %matmul = transform.structured.match ops{["linalg.matmul_transpose_b"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %generic = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- // Step 1. Find the fill, matmul and generic ops
- // ===========================================================================
- %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %matmul = transform.structured.match ops{["linalg.matmul_transpose_b"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %generic = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ // Step 2. Tile the generic and fuse the fill and matmul
+ // ===========================================================================
+ %grid_reduction, %forall_grid =
+ transform.structured.tile_using_forall %generic tile_sizes [16] ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
- // Step 2. Tile the generic and fuse the fill and matmul
- // ===========================================================================
- %grid_reduction, %forall_grid =
- transform.structured.tile_using_forall %generic tile_sizes [16] ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
+ transform.structured.fuse_into_containing_op %matmul into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.structured.fuse_into_containing_op %matmul into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ // Step 3. Vectorize
+ // ===========================================================================
+ %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func {
+ transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+ transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+ transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+ } : !transform.any_op
+ %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
- // Step 3. Vectorize
- // ===========================================================================
- %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
+ // Step 4. Bufferize
+ // ===========================================================================
+ transform.apply_patterns to %func_3 {
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ } : !transform.any_op
+ transform.apply_patterns to %func_3 {
+ transform.apply_patterns.tensor.reassociative_reshape_folding
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_cse %func_3 : !transform.any_op
+ transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+ transform.apply_patterns to %func_3 {
+ transform.apply_patterns.linalg.erase_unnecessary_inputs
+ } : !transform.any_op
+ %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
+ %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- // Step 4. Bufferize
- // ===========================================================================
- transform.apply_patterns to %func_3 {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- } : !transform.any_op
- transform.apply_patterns to %func_3 {
- transform.apply_patterns.tensor.reassociative_reshape_folding
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_cse %func_3 : !transform.any_op
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- transform.apply_patterns to %func_3 {
- transform.apply_patterns.linalg.erase_unnecessary_inputs
- } : !transform.any_op
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ // Step 6. Post-bufferization vector distribution
+ // ===========================================================================
+ %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
+ transform.iree.map_nested_forall_to_gpu_threads %func_7
+ workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
- // Step 6. Post-bufferization vector distribution
- // ===========================================================================
- %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
- transform.iree.map_nested_forall_to_gpu_threads %func_7
- workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
+ // Step 7. Do layout analysis and lower to mma
+ // ===========================================================================
+ %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
+ transform.yield
+ }
+} // module
- // Step 7. Do layout analysis and lower to mma
- // ===========================================================================
- %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
-}
diff --git a/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir b/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir
index 627d8f2..a99b19d 100644
--- a/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir
+++ b/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir
@@ -27,7 +27,8 @@
// RUN: --iree-hal-cuda-llvm-target-arch=sm_80 \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
// RUN: --iree-flow-dispatch-use-transform-dialect=%p/mma_reduction_layout_analysis_dispatch_spec.mlir \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/mma_reduction_layout_analysis_codegen_spec.mlir | \
+// RUN: --iree-codegen-transform-dialect-library=%p/mma_reduction_layout_analysis_codegen_spec.mlir \
+// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \
// RUN: iree-run-module --module=- --function=matmul_reduction --device=cuda \
// RUN: --input="16x16xf16=[[3.0,2.0,2.5,4.5,1.5,4.0,2.0,2.5,4.0,4.0,1.5,0.5,2.0,3.0,0.5,2.0],[2.5,2.5,0.5,3.5,0.0,2.5,3.5,1.0,0.5,0.0,3.0,4.5,0.5,0.5,0.0,3.5],[4.5,3.0,4.0,2.5,1.0,0.5,0.0,4.5,0.0,2.5,3.5,0.0,2.0,4.5,1.5,4.5],[0.0,2.0,1.5,0.0,2.0,1.5,3.0,2.0,2.0,4.0,4.0,2.5,0.0,3.0,2.0,0.5],[0.5,3.5,3.0,2.5,0.0,2.5,3.0,3.0,4.5,2.0,2.0,1.0,2.0,1.0,3.5,2.0],[0.0,4.5,2.0,4.0,2.5,2.5,1.5,1.5,1.5,3.0,3.0,0.0,2.5,0.5,2.0,2.0],[3.5,4.0,3.5,1.5,2.0,0.5,1.0,2.5,4.0,3.5,0.0,3.0,0.0,1.5,4.5,0.0],[4.5,3.5,1.0,4.5,0.5,0.0,1.5,4.5,1.5,3.5,3.0,2.5,0.0,0.5,0.0,4.0],[2.0,3.0,0.5,2.0,1.5,0.5,2.0,2.5,2.5,4.0,2.0,4.5,4.0,0.0,2.0,3.0],[2.5,4.0,4.0,3.0,2.0,2.0,4.5,0.5,4.5,1.0,2.0,0.0,4.5,1.0,3.0,0.5],[4.0,1.5,3.5,3.0,2.5,4.5,1.0,3.5,3.0,2.5,2.5,2.0,2.0,4.5,1.5,2.5],[3.0,3.0,0.0,2.5,1.0,3.0,0.0,1.5,1.5,2.5,0.5,1.0,3.0,3.5,1.5,1.5],[0.0,4.5,0.5,1.5,0.5,4.0,3.5,4.0,4.0,0.0,0.5,1.0,4.5,1.5,0.0,3.5],[2.5,2.0,2.5,1.5,3.0,0.0,2.0,1.0,2.5,4.0,0.0,4.0,4.0,1.5,3.0,2.5],[3.0,0.0,4.0,4.0,2.0,0.5,1.0,3.5,4.0,2.5,4.0,4.5,0.0,3.0,1.5,2.5],[0.5,0.5,2.5,4.0,1.0,2.5,0.5,4.5,2.0,3.0,1.5,4.5,1.5,4.5,0.5,1.5]]" \
// RUN: --input="16x16xf16=[[3.5,3.0,4.5,3.0,3.0,0.0,2.0,2.5,2.0,0.0,4.5,2.5,0.5,0.0,4.0,3.5],[0.0,0.5,2.0,4.5,0.0,4.0,1.5,3.5,0.5,2.5,3.5,1.5,3.5,4.5,4.0,3.0],[3.0,3.5,2.5,1.5,1.5,1.5,0.5,4.5,0.0,3.5,4.0,0.0,0.0,2.0,0.5,1.0],[1.5,4.0,3.5,3.5,0.0,0.0,0.0,2.0,3.0,1.5,0.0,3.0,0.0,2.5,2.0,3.0],[3.5,4.0,2.5,1.5,3.0,2.0,3.0,4.5,1.5,3.0,2.0,3.5,2.5,4.5,0.5,3.5],[0.0,0.0,0.0,0.5,1.0,2.5,1.5,1.0,2.5,1.5,0.0,1.5,1.5,2.0,4.5,2.5],[4.0,1.5,3.0,2.5,2.5,3.5,2.0,4.0,1.5,2.5,0.5,4.0,1.0,4.5,3.5,0.0],[1.0,2.0,4.0,4.5,4.5,3.5,0.0,1.0,4.5,3.5,2.0,3.0,0.5,4.0,3.5,1.5],[1.0,0.0,2.5,4.5,0.0,2.0,0.0,2.5,3.0,4.0,2.5,0.5,3.5,0.0,3.5,1.0],[0.0,3.5,4.0,0.0,0.0,4.5,1.0,3.5,1.5,3.0,2.0,1.0,0.5,0.5,2.0,0.0],[1.5,0.0,4.5,2.0,4.5,4.5,3.5,3.0,2.5,4.5,0.5,0.5,0.0,4.5,0.0,4.0],[4.5,3.5,4.0,4.0,1.5,4.0,1.0,4.0,2.5,0.5,4.5,3.5,3.5,0.5,4.5,3.0],[0.0,3.0,2.5,1.0,1.5,2.0,1.0,1.5,4.0,2.5,3.5,1.0,3.5,2.5,3.5,4.5],[1.5,4.5,2.0,2.0,2.0,0.5,4.0,2.0,4.0,3.5,4.0,1.0,1.5,2.5,1.0,0.0],[0.0,0.0,1.0,2.5,3.5,2.5,4.0,0.0,2.0,2.0,4.5,0.5,1.0,3.5,3.0,2.5],[2.0,2.0,0.5,2.0,4.5,2.5,3.0,1.5,4.5,2.0,3.5,3.0,1.0,2.0,1.5,2.0]]" |\
diff --git a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir
index 479aa34..1f2ca62 100644
--- a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir
@@ -1,61 +1,65 @@
// RUN: iree-opt %s
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @codegen(
+ %variant_op: !transform.any_op {transform.consumed}) {
- // Step 1. Find the fill, matmul and generic ops
- // ===========================================================================
- %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %matmul = transform.structured.match ops{["linalg.matmul_transpose_b"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %generics = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %reduce, %broadcast = transform.split_handle %generics : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ // Step 1. Find the fill, matmul and generic ops
+ // ===========================================================================
+ %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %matmul = transform.structured.match ops{["linalg.matmul_transpose_b"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %generics = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %reduce, %broadcast = transform.split_handle %generics : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- // Step 2. Tile the matmul and fuse the fill
- // ===========================================================================
- %grid_reduction, %forall_grid =
- transform.structured.tile_using_forall %broadcast tile_sizes [16] ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
- transform.structured.fuse_into_containing_op %reduce into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.structured.fuse_into_containing_op %matmul into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ // Step 2. Tile the matmul and fuse the fill
+ // ===========================================================================
+ %grid_reduction, %forall_grid =
+ transform.structured.tile_using_forall %broadcast tile_sizes [16] ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
+ transform.structured.fuse_into_containing_op %reduce into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.structured.fuse_into_containing_op %matmul into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- // Step 3. Vectorize
- // ===========================================================================
- %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
+ // Step 3. Vectorize
+ // ===========================================================================
+ %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func {
+ transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+ transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+ transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+ } : !transform.any_op
+ %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
- // Step 4. Bufferize
- // ===========================================================================
- transform.apply_patterns to %func_3 {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- } : !transform.any_op
- transform.apply_patterns to %func_3 {
- transform.apply_patterns.tensor.reassociative_reshape_folding
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_cse %func_3 : !transform.any_op
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- transform.apply_patterns to %func_3 {
- transform.apply_patterns.linalg.erase_unnecessary_inputs
- } : !transform.any_op
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ // Step 4. Bufferize
+ // ===========================================================================
+ transform.apply_patterns to %func_3 {
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ } : !transform.any_op
+ transform.apply_patterns to %func_3 {
+ transform.apply_patterns.tensor.reassociative_reshape_folding
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_cse %func_3 : !transform.any_op
+ transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+ transform.apply_patterns to %func_3 {
+ transform.apply_patterns.linalg.erase_unnecessary_inputs
+ } : !transform.any_op
+ %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
+ %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- // Step 6. Post-bufferization vector distribution
- // ===========================================================================
- %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
- transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
+ // Step 6. Post-bufferization vector distribution
+ // ===========================================================================
+ %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
+ transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
- // Step 7. Do layout analysis and lower to mma
- // ===========================================================================
- %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
-}
+ // Step 7. Do layout analysis and lower to mma
+ // ===========================================================================
+ %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
+
+ transform.yield
+ }
+} // module
diff --git a/tests/transform_dialect/cuda/mma_using_layout_analysis.mlir b/tests/transform_dialect/cuda/mma_using_layout_analysis.mlir
index d0ca10c..132b9ea 100644
--- a/tests/transform_dialect/cuda/mma_using_layout_analysis.mlir
+++ b/tests/transform_dialect/cuda/mma_using_layout_analysis.mlir
@@ -10,7 +10,7 @@
// RUN: iree-compile %s --iree-hal-target-backends=cuda \
// RUN: --iree-hal-cuda-llvm-target-arch=sm_80 \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/mma_using_layout_analysis_codegen_spec.mlir | \
+// RUN: --iree-codegen-transform-dialect-library=%p/mma_using_layout_analysis_codegen_spec.mlir | \
// RUN: iree-run-module --module=- --function=matmul --device=cuda \
// RUN: --input="16x16xf16=[[1.0,1.125,1.25,1.375,1.5,1.625,1.75,1.875,2.0,2.125,2.25,2.375,2.5,2.625,2.75,2.875],[3.0,3.125,3.25,3.375,3.5,3.625,3.75,3.875,4.0,4.125,4.25,4.375,4.5,4.625,4.75,4.875],[5.0,5.125,5.25,5.375,5.5,5.625,5.75,5.875,6.0,6.125,6.25,6.375,6.5,6.625,6.75,6.875],[7.0,7.125,7.25,7.375,7.5,7.625,7.75,7.875,8.0,8.125,8.25,8.375,8.5,8.625,8.75,8.875],[9.0,9.125,9.25,9.375,9.5,9.625,9.75,9.875,10.0,10.125,10.25,10.375,10.5,10.625,10.75,10.875],[11.0,11.125,11.25,11.375,11.5,11.625,11.75,11.875,12.0,12.125,12.25,12.375,12.5,12.625,12.75,12.875],[13.0,13.125,13.25,13.375,13.5,13.625,13.75,13.875,14.0,14.125,14.25,14.375,14.5,14.625,14.75,14.875],[15.0,15.125,15.25,15.375,15.5,15.625,15.75,15.875,16.0,16.125,16.25,16.375,16.5,16.625,16.75,16.875],[17.0,17.125,17.25,17.375,17.5,17.625,17.75,17.875,18.0,18.125,18.25,18.375,18.5,18.625,18.75,18.875],[19.0,19.125,19.25,19.375,19.5,19.625,19.75,19.875,20.0,20.125,20.25,20.375,20.5,20.625,20.75,20.875],[21.0,21.125,21.25,21.375,21.5,21.625,21.75,21.875,22.0,22.125,22.25,22.375,22.5,22.625,22.75,22.875],[23.0,23.125,23.25,23.375,23.5,23.625,23.75,23.875,24.0,24.125,24.25,24.375,24.5,24.625,24.75,24.875],[25.0,25.125,25.25,25.375,25.5,25.625,25.75,25.875,26.0,26.125,26.25,26.375,26.5,26.625,26.75,26.875],[27.0,27.125,27.25,27.375,27.5,27.625,27.75,27.875,28.0,28.125,28.25,28.375,28.5,28.625,28.75,28.875],[29.0,29.125,29.25,29.375,29.5,29.625,29.75,29.875,30.0,30.125,30.25,30.375,30.5,30.625,30.75,30.875],[31.0,31.125,31.25,31.375,31.5,31.625,31.75,31.875,32.0,32.125,32.25,32.375,32.5,32.625,32.75,32.875]]" \
// RUN: --input="16x8xf16=[[1.0,1.125,1.25,1.375,1.5,1.625,1.75,1.875],[2.0,2.125,2.25,2.375,2.5,2.625,2.75,2.875],[3.0,3.125,3.25,3.375,3.5,3.625,3.75,3.875],[4.0,4.125,4.25,4.375,4.5,4.625,4.75,4.875],[5.0,5.125,5.25,5.375,5.5,5.625,5.75,5.875],[6.0,6.125,6.25,6.375,6.5,6.625,6.75,6.875],[7.0,7.125,7.25,7.375,7.5,7.625,7.75,7.875],[8.0,8.125,8.25,8.375,8.5,8.625,8.75,8.875],[9.0,9.125,9.25,9.375,9.5,9.625,9.75,9.875],[10.0,10.125,10.25,10.375,10.5,10.625,10.75,10.875],[11.0,11.125,11.25,11.375,11.5,11.625,11.75,11.875],[12.0,12.125,12.25,12.375,12.5,12.625,12.75,12.875],[13.0,13.125,13.25,13.375,13.5,13.625,13.75,13.875],[14.0,14.125,14.25,14.375,14.5,14.625,14.75,14.875],[15.0,15.125,15.25,15.375,15.5,15.625,15.75,15.875],[16.0,16.125,16.25,16.375,16.5,16.625,16.75,16.875]]" |\
diff --git a/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir
index 9d76c73..cea3833 100644
--- a/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir
@@ -1,73 +1,75 @@
// RUN: iree-opt %s
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(
+ %variant_op: !transform.any_op {transform.consumed}) {
+ // Step 1. Find the fill and matmul ops
+ // ===========================================================================
+ %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- // Step 1. Find the fill and matmul ops
- // ===========================================================================
- %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ // Step 2. Tile the matmul and fuse the fill
+ // ===========================================================================
+ %grid_reduction, %forall_grid =
+ transform.structured.tile_using_forall %matmul tile_sizes [16] ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
- // Step 2. Tile the matmul and fuse the fill
- // ===========================================================================
- %grid_reduction, %forall_grid =
- transform.structured.tile_using_forall %matmul tile_sizes [16] ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
+ transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
- // Promote operands in order to test loading from shared memory.
- %matmul_2 = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %promoted_matmul, %alloc_0, %alloc_1 =
- transform.iree.promote_operands %matmul_2 [0, 1]
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+ // Promote operands in order to test loading from shared memory.
+ %matmul_2 = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %promoted_matmul, %alloc_0, %alloc_1 =
+ transform.iree.promote_operands %matmul_2 [0, 1]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
- // Step 3. Vectorize
- // ===========================================================================
- %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
+ // Step 3. Vectorize
+ // ===========================================================================
+ %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func {
+ transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+ transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+ transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+ } : !transform.any_op
+ %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
- // Step 4. Bufferize
- // ===========================================================================
- transform.apply_patterns to %func_3 {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- } : !transform.any_op
- transform.apply_patterns to %func_3 {
- transform.apply_patterns.tensor.reassociative_reshape_folding
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_cse %func_3 : !transform.any_op
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- transform.apply_patterns to %func_3 {
- transform.apply_patterns.linalg.erase_unnecessary_inputs
- } : !transform.any_op
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ // Step 4. Bufferize
+ // ===========================================================================
+ transform.apply_patterns to %func_3 {
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ } : !transform.any_op
+ transform.apply_patterns to %func_3 {
+ transform.apply_patterns.tensor.reassociative_reshape_folding
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_cse %func_3 : !transform.any_op
+ transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+ transform.apply_patterns to %func_3 {
+ transform.apply_patterns.linalg.erase_unnecessary_inputs
+ } : !transform.any_op
+ %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
+ %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- // Step 5. Pre-process the contract and transfer ops to put it in the right form.
- // ===========================================================================
- %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func_2 {
- transform.apply_patterns.iree.prepare_vector_to_mma
- } : !transform.any_op
+ // Step 5. Pre-process the contract and transfer ops to put it in the right form.
+ // ===========================================================================
+ %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func_2 {
+ transform.apply_patterns.iree.prepare_vector_to_mma
+ } : !transform.any_op
- // Step 6. Post-bufferization vector distribution
- // ===========================================================================
- %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
- transform.iree.map_nested_forall_to_gpu_threads %func_7
- workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
+ // Step 6. Post-bufferization vector distribution
+ // ===========================================================================
+ %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
+ transform.iree.map_nested_forall_to_gpu_threads %func_7
+ workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
- // Step 7. Do layout analysis and lower to mma
- // ===========================================================================
- %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
-}
+ // Step 7. Do layout analysis and lower to mma
+ // ===========================================================================
+ %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
+ transform.yield
+ }
+} // module
diff --git a/tests/transform_dialect/cuda/reduction.mlir b/tests/transform_dialect/cuda/reduction.mlir
index e814353..500eba6 100644
--- a/tests/transform_dialect/cuda/reduction.mlir
+++ b/tests/transform_dialect/cuda/reduction.mlir
@@ -25,7 +25,8 @@
// RUN: --iree-hal-configuration-pipeline | \
// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/reduction_codegen_spec.mlir | \
+// RUN: --iree-codegen-transform-dialect-library=%p/reduction_codegen_spec.mlir \
+// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \
// RUN: FileCheck %s --check-prefix=CHECK
// RUN: iree-compile %s --iree-hal-target-backends=cuda \
@@ -33,7 +34,8 @@
/// Constant JIT'ing must be disabled because the transform-dialect debug
/// flags leak to the JIT session, which doesn't know what to do with them.
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/reduction_codegen_spec.mlir | \
+// RUN: --iree-codegen-transform-dialect-library=%p/reduction_codegen_spec.mlir \
+// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \
// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\
// RUN: FileCheck %s --check-prefix=EXEC
diff --git a/tests/transform_dialect/cuda/reduction_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_codegen_spec.mlir
index 4fc6a49..0ab6d6a 100644
--- a/tests/transform_dialect/cuda/reduction_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/reduction_codegen_spec.mlir
@@ -1,115 +1,120 @@
// RUN: iree-opt %s
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
- %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @codegen(
+ %variant_op: !transform.any_op {transform.consumed}) {
- // Step 1. Split the reduction to get meatier (size(red) / 2)-way parallelism.
- // ===========================================================================
- %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %init_or_alloc_op, %more_parallel_fill_op, %more_parallel_op, %combiner_op =
- transform.structured.split_reduction %0
- { split_factor = 2, insert_split_dimension = 1 }
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+ %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- // Step 2. First level of tiling + fusion parallelizes to blocks.
- // ===========================================================================
- %grid_combiner_op, %forall_grid =
- transform.structured.tile_using_forall %combiner_op tile_sizes [1]
- ( mapping = [#gpu.block<x>] )
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
- %not_combiner = transform.merge_handles %fill, %more_parallel_fill_op, %more_parallel_op : !transform.any_op
- transform.structured.fuse_into_containing_op %not_combiner into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ // Step 1. Split the reduction to get meatier (size(red) / 2)-way parallelism.
+ // ===========================================================================
+ %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %init_or_alloc_op, %more_parallel_fill_op, %more_parallel_op, %combiner_op =
+ transform.structured.split_reduction %0
+ { split_factor = 2, insert_split_dimension = 1 }
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
- // Step 3. Second level of tiling + fusion parallelizes to threads.
- // ===========================================================================
- %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op
- : (!transform.any_op) -> !transform.any_op
- %block_combiner_op, %forall_block_combiner_op =
- transform.structured.tile_using_forall %grid_combiner_op tile_sizes [1]
- ( mapping = [#gpu.thread<z>] )
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.structured.fuse_into_containing_op %fill_1d into %forall_block_combiner_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ // Step 2. First level of tiling + fusion parallelizes to blocks.
+ // ===========================================================================
+ %grid_combiner_op, %forall_grid =
+ transform.structured.tile_using_forall %combiner_op tile_sizes [1]
+ ( mapping = [#gpu.block<x>] )
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
+ %not_combiner = transform.merge_handles %fill, %more_parallel_fill_op, %more_parallel_op : !transform.any_op
+ transform.structured.fuse_into_containing_op %not_combiner into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- // Canonicalizations.
- %func_op = transform.structured.match ops{["func.func"]} in %variant_op
+ // Step 3. Second level of tiling + fusion parallelizes to threads.
+ // ===========================================================================
+ %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op
: (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func_op {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- } : !transform.any_op
- transform.iree.apply_licm %func_op : !transform.any_op
- transform.iree.apply_cse %func_op : !transform.any_op
+ %block_combiner_op, %forall_block_combiner_op =
+ transform.structured.tile_using_forall %grid_combiner_op tile_sizes [1]
+ ( mapping = [#gpu.thread<z>] )
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.structured.fuse_into_containing_op %fill_1d into %forall_block_combiner_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op
- : (!transform.any_op) -> !transform.any_op
- %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]}
- attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op
+ // Canonicalizations.
+ %func_op = transform.structured.match ops{["func.func"]} in %variant_op
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func_op {
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ } : !transform.any_op
+ transform.iree.apply_licm %func_op : !transform.any_op
+ transform.iree.apply_cse %func_op : !transform.any_op
+
+ %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op
: (!transform.any_op) -> !transform.any_op
- %block_more_parallel_op, %forall_block_more_parallel_op =
- transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1]
- ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]}
+ attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op
+ : (!transform.any_op) -> !transform.any_op
+ %block_more_parallel_op, %forall_block_more_parallel_op =
+ transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1]
+ ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- // Step 4. Rank-reduce and vectorize.
- // ===========================================================================
- %func = transform.structured.match ops{["func.func"]} in %variant_op
- : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
+ // Step 4. Rank-reduce and vectorize.
+ // ===========================================================================
+ %func = transform.structured.match ops{["func.func"]} in %variant_op
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func {
+ transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+ transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+ transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+ } : !transform.any_op
+ %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
- // Step 5. Bufferize and drop HAL decriptor from memref ops.
- // ===========================================================================
- transform.apply_patterns to %func_3 {
- transform.apply_patterns.tensor.reassociative_reshape_folding
- } : !transform.any_op
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
- : (!transform.any_op) -> !transform.any_op
+    // Step 5. Bufferize and drop HAL descriptor from memref ops.
+ // ===========================================================================
+ transform.apply_patterns to %func_3 {
+ transform.apply_patterns.tensor.reassociative_reshape_folding
+ } : !transform.any_op
+ transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+ %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
+ %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
+ : (!transform.any_op) -> !transform.any_op
- // Step 6. Post-bufferization mapping to blocks and threads.
- // ===========================================================================
- %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_3
- : (!transform.any_op) -> !transform.any_op
- transform.iree.forall_to_workgroup %func_5 : (!transform.any_op) -> ()
- transform.iree.map_nested_forall_to_gpu_threads %func_5
- workgroup_dims = [32, 2, 1] : (!transform.any_op) -> ()
+ // Step 6. Post-bufferization mapping to blocks and threads.
+ // ===========================================================================
+ %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_3
+ : (!transform.any_op) -> !transform.any_op
+ transform.iree.forall_to_workgroup %func_5 : (!transform.any_op) -> ()
+ transform.iree.map_nested_forall_to_gpu_threads %func_5
+ workgroup_dims = [32, 2, 1] : (!transform.any_op) -> ()
- // Step 7. Post-bufferization vector distribution with rank-reduction.
- // ===========================================================================
- transform.apply_patterns to %func_5 {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.memref.fold_memref_alias_ops
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
- : (!transform.any_op) -> !transform.any_op
- // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0)
- // at this point.
- transform.sequence %variant_op_3 : !transform.any_op failures(suppress) {
- ^bb0(%arg0: !transform.any_op):
- transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op
+ // Step 7. Post-bufferization vector distribution with rank-reduction.
+ // ===========================================================================
+ transform.apply_patterns to %func_5 {
+ transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+ transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+ transform.apply_patterns.memref.fold_memref_alias_ops
+ transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+ } : !transform.any_op
+ %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
+ : (!transform.any_op) -> !transform.any_op
+ // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0)
+ // at this point.
+ transform.sequence %variant_op_3 : !transform.any_op failures(suppress) {
+ ^bb0(%arg0: !transform.any_op):
+ transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op
+ }
+ transform.iree.vector.warp_distribute %func_5 : (!transform.any_op) -> ()
+
+
+ // Late Canonicalizations.
+ %func_op_3 = transform.structured.match ops{["func.func"]} in %variant_op_3
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func_op_3 {
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ } : !transform.any_op
+ transform.iree.apply_licm %func_op_3 : !transform.any_op
+ transform.iree.apply_cse %func_op_3 : !transform.any_op
+
+ transform.yield
}
- transform.iree.vector.warp_distribute %func_5 : (!transform.any_op) -> ()
-
-
- // Late Canonicalizations.
- %func_op_3 = transform.structured.match ops{["func.func"]} in %variant_op_3
- : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func_op_3 {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- } : !transform.any_op
- transform.iree.apply_licm %func_op_3 : !transform.any_op
- transform.iree.apply_cse %func_op_3 : !transform.any_op
-}
+} // module
diff --git a/tests/transform_dialect/cuda/reduction_eltwise.mlir b/tests/transform_dialect/cuda/reduction_eltwise.mlir
index 5e75858..a266998 100644
--- a/tests/transform_dialect/cuda/reduction_eltwise.mlir
+++ b/tests/transform_dialect/cuda/reduction_eltwise.mlir
@@ -36,12 +36,14 @@
// RUN: --iree-hal-configuration-pipeline | \
// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/reduction_eltwise_codegen_spec.mlir | \
+// RUN: --iree-codegen-transform-dialect-library=%p/reduction_eltwise_codegen_spec.mlir \
+// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \
// RUN: FileCheck %s --check-prefix=CHECK
// RUN: iree-compile %s --iree-hal-target-backends=cuda \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/reduction_eltwise_codegen_spec.mlir | \
+// RUN: --iree-codegen-transform-dialect-library=%p/reduction_eltwise_codegen_spec.mlir \
+// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \
// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\
// RUN: FileCheck %s --check-prefix=EXEC
diff --git a/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir
index 4a95480..fb4bcd6 100644
--- a/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir
@@ -1,155 +1,160 @@
// RUN: iree-opt %s
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
- %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op
- : (!transform.any_op) -> !transform.any_op
-
- // Step 1. Split the reduction to get meatier (size(red) / 2)-way parallelism.
- // ===========================================================================
- %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op
- : (!transform.any_op) -> !transform.any_op
- %reduction, %eltwise = transform.split_handle %0
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- %init_or_alloc_op, %more_parallel_fill_op, %more_parallel_op, %combiner_op =
- transform.structured.split_reduction %reduction
- { split_factor = 2, insert_split_dimension = 1 }
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-
- // Canonicalizations.
- %func_op = transform.structured.match ops{["func.func"]} in %variant_op
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @codegen(
+ %variant_op: !transform.any_op {transform.consumed}) {
+ %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op
: (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func_op {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_licm %func_op : !transform.any_op
- transform.iree.apply_cse %func_op : !transform.any_op
- // Step 2. First level of tiling + fusion parallelizes to blocks. Tile the
- // trailing elementwise the same way we want to tile the reduction.
- // ===========================================================================
- %eltwise_grid_op, %grid_loop = transform.structured.tile_using_forall %eltwise
- tile_sizes [1] (mapping = [#gpu.block<x>])
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.iree.populate_workgroup_count_region_using_num_threads_slice %grid_loop : (!transform.any_op) -> ()
- %not_eltwise = transform.merge_handles %fill, %more_parallel_fill_op, %more_parallel_op, %combiner_op
- : !transform.any_op
- transform.structured.fuse_into_containing_op %not_eltwise into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
- // Canonicalizations.
- transform.apply_patterns to %func_op {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_licm %func_op : !transform.any_op
- transform.iree.apply_cse %func_op : !transform.any_op
-
- // Step 3. Second level of tiling + fusion parallelizes to threads.
- // ===========================================================================
- %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op
- : (!transform.any_op) -> !transform.any_op
- %eltwise_block_op, %eltwise_block_loop =
- transform.structured.tile_using_forall %eltwise_grid_op tile_sizes [1]
- ( mapping = [#gpu.thread<z>] )
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- %block_combiner_op = transform.structured.match ops{["linalg.generic"]}
- attributes {iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op
+ // Step 1. Split the reduction to get meatier (size(red) / 2)-way parallelism.
+ // ===========================================================================
+ %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op
: (!transform.any_op) -> !transform.any_op
- %combined_and_fill = transform.merge_handles %fill_1d, %block_combiner_op : !transform.any_op
- transform.structured.fuse_into_containing_op %combined_and_fill into %eltwise_block_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %reduction, %eltwise = transform.split_handle %0
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %init_or_alloc_op, %more_parallel_fill_op, %more_parallel_op, %combiner_op =
+ transform.structured.split_reduction %reduction
+ { split_factor = 2, insert_split_dimension = 1 }
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
- // Canonicalizations.
- transform.apply_patterns to %func_op {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_licm %func_op : !transform.any_op
- transform.iree.apply_cse %func_op : !transform.any_op
+ // Canonicalizations.
+ %func_op = transform.structured.match ops{["func.func"]} in %variant_op
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func_op {
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_licm %func_op : !transform.any_op
+ transform.iree.apply_cse %func_op : !transform.any_op
- %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op
- : (!transform.any_op) -> !transform.any_op
- %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]}
- attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op
+ // Step 2. First level of tiling + fusion parallelizes to blocks. Tile the
+ // trailing elementwise the same way we want to tile the reduction.
+ // ===========================================================================
+ %eltwise_grid_op, %grid_loop = transform.structured.tile_using_forall %eltwise
+ tile_sizes [1] (mapping = [#gpu.block<x>])
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.iree.populate_workgroup_count_region_using_num_threads_slice %grid_loop : (!transform.any_op) -> ()
+ %not_eltwise = transform.merge_handles %fill, %more_parallel_fill_op, %more_parallel_op, %combiner_op
+ : !transform.any_op
+ transform.structured.fuse_into_containing_op %not_eltwise into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // Canonicalizations.
+ transform.apply_patterns to %func_op {
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_licm %func_op : !transform.any_op
+ transform.iree.apply_cse %func_op : !transform.any_op
+
+ // Step 3. Second level of tiling + fusion parallelizes to threads.
+ // ===========================================================================
+ %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op
: (!transform.any_op) -> !transform.any_op
- %block_more_parallel_op, %forall_block_more_parallel_op =
- transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1]
- ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %eltwise_block_op, %eltwise_block_loop =
+ transform.structured.tile_using_forall %eltwise_grid_op tile_sizes [1]
+ ( mapping = [#gpu.thread<z>] )
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %block_combiner_op = transform.structured.match ops{["linalg.generic"]}
+ attributes {iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op
+ : (!transform.any_op) -> !transform.any_op
+ %combined_and_fill = transform.merge_handles %fill_1d, %block_combiner_op : !transform.any_op
+ transform.structured.fuse_into_containing_op %combined_and_fill into %eltwise_block_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- // Canonicalizations.
- transform.apply_patterns to %func_op {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_licm %func_op : !transform.any_op
- transform.iree.apply_cse %func_op : !transform.any_op
+ // Canonicalizations.
+ transform.apply_patterns to %func_op {
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_licm %func_op : !transform.any_op
+ transform.iree.apply_cse %func_op : !transform.any_op
- // Step 4. Rank-reduce and vectorize.
- // ===========================================================================
- %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
+ %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op
+ : (!transform.any_op) -> !transform.any_op
+ %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]}
+ attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op
+ : (!transform.any_op) -> !transform.any_op
+ %block_more_parallel_op, %forall_block_more_parallel_op =
+ transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1]
+ ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- // Step 5. Bufferize and drop HAL decriptor from memref ops.
- // ===========================================================================
- transform.apply_patterns to %func_3 {
- transform.apply_patterns.tensor.reassociative_reshape_folding
- } : !transform.any_op
- transform.iree.eliminate_empty_tensors %variant_op: (!transform.any_op) -> ()
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
- : (!transform.any_op) -> !transform.any_op
+ // Canonicalizations.
+ transform.apply_patterns to %func_op {
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_licm %func_op : !transform.any_op
+ transform.iree.apply_cse %func_op : !transform.any_op
- // Step 6. Post-bufferization mapping to blocks and threads.
- // ===========================================================================
- %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.iree.forall_to_workgroup %func_5 : (!transform.any_op) -> ()
- transform.iree.map_nested_forall_to_gpu_threads %func_5
- workgroup_dims = [32, 2, 1] : (!transform.any_op) -> ()
+ // Step 4. Rank-reduce and vectorize.
+ // ===========================================================================
+ %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func {
+ transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+ transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+ transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+ } : !transform.any_op
+ %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
- // Step 7. Post-bufferization vector distribution with rank-reduction.
- // ===========================================================================
- transform.apply_patterns to %func_5 {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.memref.fold_memref_alias_ops
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
- : (!transform.any_op) -> !transform.any_op
- // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0)
- // at this point.
- transform.sequence %variant_op_3 : !transform.any_op failures(suppress) {
- ^bb0(%arg0: !transform.any_op):
- transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
- : (!transform.any_op) -> !transform.any_op
+  // Step 5. Bufferize and drop HAL descriptor from memref ops.
+ // ===========================================================================
+ transform.apply_patterns to %func_3 {
+ transform.apply_patterns.tensor.reassociative_reshape_folding
+ } : !transform.any_op
+ transform.iree.eliminate_empty_tensors %variant_op: (!transform.any_op) -> ()
+ %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
+ %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
+ : (!transform.any_op) -> !transform.any_op
+
+ // Step 6. Post-bufferization mapping to blocks and threads.
+ // ===========================================================================
+ %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ transform.iree.forall_to_workgroup %func_5 : (!transform.any_op) -> ()
+ transform.iree.map_nested_forall_to_gpu_threads %func_5
+ workgroup_dims = [32, 2, 1] : (!transform.any_op) -> ()
+
+ // Step 7. Post-bufferization vector distribution with rank-reduction.
+ // ===========================================================================
+ transform.apply_patterns to %func_5 {
+ transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+ transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+ transform.apply_patterns.memref.fold_memref_alias_ops
+ transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+ } : !transform.any_op
+ %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
+ : (!transform.any_op) -> !transform.any_op
+ // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0)
+ // at this point.
+ transform.sequence %variant_op_3 : !transform.any_op failures(suppress) {
+ ^bb0(%arg0: !transform.any_op):
+ transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
+ : (!transform.any_op) -> !transform.any_op
+ }
+ transform.iree.vector.warp_distribute %func_5 : (!transform.any_op) -> ()
+
+
+ // Late canonicalizations.
+ %func_op_3 = transform.structured.match ops{["func.func"]} in %variant_op_3
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func_op_3 {
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_licm %func_op_3 : !transform.any_op
+ transform.iree.apply_cse %func_op_3 : !transform.any_op
+
+ transform.yield
}
- transform.iree.vector.warp_distribute %func_5 : (!transform.any_op) -> ()
+} // module
-
- // Late canonicalizations.
- %func_op_3 = transform.structured.match ops{["func.func"]} in %variant_op_3
- : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func_op_3 {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_licm %func_op_3 : !transform.any_op
- transform.iree.apply_cse %func_op_3 : !transform.any_op
-}
diff --git a/tests/transform_dialect/cuda/reduction_v2.mlir b/tests/transform_dialect/cuda/reduction_v2.mlir
index de96cc1..6ff6442 100644
--- a/tests/transform_dialect/cuda/reduction_v2.mlir
+++ b/tests/transform_dialect/cuda/reduction_v2.mlir
@@ -25,12 +25,14 @@
// RUN: --iree-hal-configuration-pipeline | \
// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/reduction_v2_codegen_spec.mlir | \
+// RUN: --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir \
+// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \
// RUN: FileCheck %s --check-prefix=CHECK
// RUN: iree-compile %s --iree-hal-target-backends=cuda \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/reduction_v2_codegen_spec.mlir | \
+// RUN: --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir \
+// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \
// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="33x1024xf32=1" |\
// RUN: FileCheck %s --check-prefix=EXEC
diff --git a/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir
index f7a186d..b1479a0 100644
--- a/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir
@@ -1,103 +1,108 @@
// RUN: iree-opt %s
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
- %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %reduction = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @codegen(
+ %variant_op: !transform.any_op {transform.consumed}) {
- // Step 1. First level of tiling + fusion parallelizes to blocks.
- // ===========================================================================
- %grid_reduction, %forall_grid =
- transform.structured.tile_using_forall %reduction tile_sizes [1]
- ( mapping = [#gpu.block<x>] )
+ %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %reduction = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+
+ // Step 1. First level of tiling + fusion parallelizes to blocks.
+ // ===========================================================================
+ %grid_reduction, %forall_grid =
+ transform.structured.tile_using_forall %reduction tile_sizes [1]
+ ( mapping = [#gpu.block<x>] )
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
+ transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // Step 2. Split the reduction to get meatier parallelism.
+ // ===========================================================================
+ %block_more_parallel_fill_op_2, %block_more_parallel_op_2, %block_combiner_op_2, %forall =
+ transform.structured.tile_reduction_using_for %grid_reduction by tile_sizes = [0, 128]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+ %_1:2 =
+ transform.structured.tile_using_forall %block_more_parallel_op_2 num_threads [0, 32]
+ ( mapping = [#gpu.thread<x>] )
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
- transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- // Step 2. Split the reduction to get meatier parallelism.
- // ===========================================================================
- %block_more_parallel_fill_op_2, %block_more_parallel_op_2, %block_combiner_op_2, %forall =
- transform.structured.tile_reduction_using_for %grid_reduction by tile_sizes = [0, 128]
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
- %_1:2 =
- transform.structured.tile_using_forall %block_more_parallel_op_2 num_threads [0, 32]
- ( mapping = [#gpu.thread<x>] )
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ // Step 3. Second level of tiling parallelizes to threads.
+ // ===========================================================================
+ // 1st op is [parallel, parallel], map it to threadIdx.x by 4.
+ %_2:2 =
+ transform.structured.tile_using_forall %block_more_parallel_fill_op_2 tile_sizes [0, 4]
+ ( mapping = [#gpu.thread<x>] )
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ // 2nd op is [parallel, reduction] of 1x128, map the 1-dim to threadIdx.y to
+ // trigger mapping of the reduction to threadIdx.x via predication via `if (x==0)`.
+ %_3:2 =
+ transform.structured.tile_using_forall %block_combiner_op_2 tile_sizes [1]
+ ( mapping = [#gpu.thread<y>] )
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- // Step 3. Second level of tiling parallelizes to threads.
- // ===========================================================================
- // 1st op is [parallel, parallel], map it to threadIdx.x by 4.
- %_2:2 =
- transform.structured.tile_using_forall %block_more_parallel_fill_op_2 tile_sizes [0, 4]
- ( mapping = [#gpu.thread<x>] )
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- // 2nd op is [parallel, reduction] of 1x128, map the 1-dim to threadIdx.y to
- // trigger mapping of the reduction to threadIdx.x via predication via `if (x==0)`.
- %_3:2 =
- transform.structured.tile_using_forall %block_combiner_op_2 tile_sizes [1]
- ( mapping = [#gpu.thread<y>] )
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ // Step 4. Rank-reduce and vectorize.
+ // ===========================================================================
+ %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func {
+ transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+ transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+ transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+ } : !transform.any_op
+ %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
- // Step 4. Rank-reduce and vectorize.
- // ===========================================================================
- %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
+  // Step 5. Bufferize and drop HAL descriptor from memref ops.
+ // ===========================================================================
+ // Canonicalization/CSE is needed before bufferization otherwise unnecessary
+ // allocs will be created.
+ transform.apply_patterns to %func_3 {
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ } : !transform.any_op
+ transform.apply_patterns to %func_3 {
+ transform.apply_patterns.tensor.reassociative_reshape_folding
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_cse %func_3 : !transform.any_op
+ transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+ %func_5 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func_5 {
+ transform.apply_patterns.linalg.erase_unnecessary_inputs
+ } : !transform.any_op
+ %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
+ %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- // Step 5. Bufferize and drop HAL decriptor from memref ops.
- // ===========================================================================
- // Canonicalization/CSE is needed before bufferization otherwise unnecessary
- // allocs will be created.
- transform.apply_patterns to %func_3 {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- } : !transform.any_op
- transform.apply_patterns to %func_3 {
- transform.apply_patterns.tensor.reassociative_reshape_folding
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_cse %func_3 : !transform.any_op
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- %func_5 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func_5 {
- transform.apply_patterns.linalg.erase_unnecessary_inputs
- } : !transform.any_op
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ // Step 6. Post-bufferization mapping to blocks and threads.
+ // ===========================================================================
+ %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
+ transform.iree.map_nested_forall_to_gpu_threads %func_7
+ workgroup_dims = [32, 1, 1] : (!transform.any_op) -> ()
- // Step 6. Post-bufferization mapping to blocks and threads.
- // ===========================================================================
- %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
- transform.iree.map_nested_forall_to_gpu_threads %func_7
- workgroup_dims = [32, 1, 1] : (!transform.any_op) -> ()
+ // Step 7. Post-bufferization vector distribution with rank-reduction.
+ // ===========================================================================
+ transform.apply_patterns to %func_7 {
+ transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+ transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+ transform.apply_patterns.memref.fold_memref_alias_ops
+ transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+ } : !transform.any_op
+ %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
+ : (!transform.any_op) -> !transform.any_op
+ %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op
+ transform.iree.vector.warp_distribute %func_7
+ : (!transform.any_op) -> ()
- // Step 7. Post-bufferization vector distribution with rank-reduction.
- // ===========================================================================
- transform.apply_patterns to %func_7 {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.memref.fold_memref_alias_ops
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
- : (!transform.any_op) -> !transform.any_op
- %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op
- transform.iree.vector.warp_distribute %func_7
- : (!transform.any_op) -> ()
-
- // Late canonicalizations to cleanup and pass the checks
- transform.apply_patterns to %func_7 {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_licm %func_7 : !transform.any_op
- transform.iree.apply_cse %func_7 : !transform.any_op
-}
+ // Late canonicalizations to cleanup and pass the checks
+ transform.apply_patterns to %func_7 {
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_licm %func_7 : !transform.any_op
+ transform.iree.apply_cse %func_7 : !transform.any_op
+
+ transform.yield
+ }
+} // module
diff --git a/tests/transform_dialect/cuda/reduction_v2_uneven.mlir b/tests/transform_dialect/cuda/reduction_v2_uneven.mlir
index ebc5d8a..29b2d48 100644
--- a/tests/transform_dialect/cuda/reduction_v2_uneven.mlir
+++ b/tests/transform_dialect/cuda/reduction_v2_uneven.mlir
@@ -25,12 +25,14 @@
// RUN: --iree-hal-configuration-pipeline | \
// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/reduction_v2_codegen_spec.mlir | \
+// RUN: --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir \
+// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \
// RUN: FileCheck %s --check-prefix=CHECK
// RUN: iree-compile %s --iree-hal-target-backends=cuda \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/reduction_v2_codegen_spec.mlir | \
+// RUN: --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir \
+// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \
// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="33x34567xf32=1" |\
// RUN: FileCheck %s --check-prefix=EXEC
diff --git a/tests/transform_dialect/cuda/softmax.mlir b/tests/transform_dialect/cuda/softmax.mlir
index 299b5cb..27464db 100644
--- a/tests/transform_dialect/cuda/softmax.mlir
+++ b/tests/transform_dialect/cuda/softmax.mlir
@@ -6,7 +6,8 @@
// RUN: --iree-stream-transformation-pipeline \
// RUN: --iree-hal-configuration-pipeline | \
// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/softmax_codegen_spec.mlir \
+// RUN: --iree-codegen-transform-dialect-library=%p/softmax_codegen_spec.mlir \
+// RUN: --iree-codegen-use-transform-dialect-strategy=codegen \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false | \
// RUN: FileCheck %s --check-prefix=CHECK-SHUFFLE
@@ -16,7 +17,8 @@
// RUN: --iree-opt-const-expr-hoisting=false --iree-opt-const-eval=false \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
// RUN: --iree-flow-dispatch-use-transform-dialect=%p/softmax_dispatch_spec.mlir \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/softmax_codegen_spec.mlir | \
+// RUN: --iree-codegen-transform-dialect-library=%p/softmax_codegen_spec.mlir \
+// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \
// RUN: iree-run-module --module=- --function=softmax --device=cuda | \
// RUN: FileCheck %s
diff --git a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_codegen_spec.mlir
index 4c71f83..345be1f 100644
--- a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/softmax_codegen_spec.mlir
@@ -1,109 +1,114 @@
// RUN: iree-opt %s
// Codegen
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
- %ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
- in %variant_op : (!transform.any_op) -> !transform.any_op
- %input_max_fill,
- %input_max,
- %exps_sum_fill,
- %exps,
- %exps_sum,
- %div = transform.split_handle %ops
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op,
- !transform.any_op, !transform.any_op, !transform.any_op)
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @codegen(
+ %variant_op: !transform.any_op {transform.consumed}) {
- // Step 1. First level of tiling + fusion parallelizes to blocks.
- // ==============================================================
- %_, %forall =
- transform.structured.tile_using_forall %div tile_sizes [1, 4]
- ( mapping = [#gpu.block<x>, #gpu.block<y>] )
+ %ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
+ in %variant_op : (!transform.any_op) -> !transform.any_op
+ %input_max_fill,
+ %input_max,
+ %exps_sum_fill,
+ %exps,
+ %exps_sum,
+ %div = transform.split_handle %ops
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op,
+ !transform.any_op, !transform.any_op, !transform.any_op)
+
+ // Step 1. First level of tiling + fusion parallelizes to blocks.
+ // ==============================================================
+ %_, %forall =
+ transform.structured.tile_using_forall %div tile_sizes [1, 4]
+ ( mapping = [#gpu.block<x>, #gpu.block<y>] )
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> ()
+
+ // TODO: Merging and fusing merged handles does not work properly atm.
+ transform.structured.fuse_into_containing_op %exps_sum into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.structured.fuse_into_containing_op %exps into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.structured.fuse_into_containing_op %exps_sum_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.structured.fuse_into_containing_op %input_max into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.structured.fuse_into_containing_op %input_max_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ // By default, fusion into scf.forall does not promote captured values
+ // to shared as this involves a cross-thread dependence analysis.
+ // Instead, we activate it explicitly post-hoc to promote all the extract_slice
+ // ops that we find and match the prerequisites
+ %forall_with_type = transform.cast %forall : !transform.any_op to !transform.op<"scf.forall">
+ transform.iree.share_forall_operands %forall_with_type
+ : (!transform.op<"scf.forall">) -> !transform.op<"scf.forall">
+ transform.apply_patterns to %variant_op {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_cse %variant_op : !transform.any_op
+
+ // Step 2. Second level of tiling + fusion parallelizes to threads.
+ // ================================================================
+ %tiled_ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
+ in %variant_op : (!transform.any_op) -> !transform.any_op
+ %tiled_input_max_fill,
+ %tiled_input_max,
+ %tiled_exps_sum_fill,
+ %tiled_exp_and_exps_sum,
+ %tiled_exp_and_exps_sum_2,
+ %tiled_div = transform.split_handle %tiled_ops
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op,
+ !transform.any_op, !transform.any_op, !transform.any_op)
+ // Leaving the reduction untiled on threadIdx.x makes it sequential on
+ // threadIdx.x. After distribution, predication by `if (threadIdx.x == 0)` is
+ // introduced and opportunities for distributing vector ops across warps
+ // appear.
+ %reduction_linalg_ops = transform.merge_handles %tiled_input_max,
+ %tiled_exp_and_exps_sum,
+ %tiled_exp_and_exps_sum_2
+ : !transform.any_op
+ transform.structured.tile_using_forall %reduction_linalg_ops tile_sizes [1, 1]
+ ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> ()
+ // Fully parallel ops are tiled and mapped.
+ %parallel_linalg_ops = transform.merge_handles %tiled_input_max_fill,
+ %tiled_exps_sum_fill,
+ %tiled_div
+ : !transform.any_op
+ transform.structured.tile_using_forall %parallel_linalg_ops num_threads [1, 4, 32]
+ ( mapping = [#gpu.thread<z>, #gpu.thread<y>, #gpu.thread<x>] )
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- // TODO: Merging and fusing merged handles does not work properly atm.
- transform.structured.fuse_into_containing_op %exps_sum into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.structured.fuse_into_containing_op %exps into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.structured.fuse_into_containing_op %exps_sum_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.structured.fuse_into_containing_op %input_max into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.structured.fuse_into_containing_op %input_max_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- // By default, fusion into scf.forall does not promote captured values
- // to shared as this involves a cross-thread dependence analysis.
- // Instead, we activate it explicitly post-hoc to promote all the extract_slice
- // ops that we find and match the prerequisites
- %forall_with_type = transform.cast %forall : !transform.any_op to !transform.op<"scf.forall">
- transform.iree.share_forall_operands %forall_with_type
- : (!transform.op<"scf.forall">) -> !transform.op<"scf.forall">
- transform.apply_patterns to %variant_op {
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_cse %variant_op : !transform.any_op
+ // Step 3. Rank-reduce and vectorize.
+ // ==================================
+ %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func {
+ transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+ transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+ transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+ } : !transform.any_op
+ transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
- // Step 2. Second level of tiling + fusion parallelizes to threads.
- // ================================================================
- %tiled_ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
- in %variant_op : (!transform.any_op) -> !transform.any_op
- %tiled_input_max_fill,
- %tiled_input_max,
- %tiled_exps_sum_fill,
- %tiled_exp_and_exps_sum,
- %tiled_exp_and_exps_sum_2,
- %tiled_div = transform.split_handle %tiled_ops
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op,
- !transform.any_op, !transform.any_op, !transform.any_op)
- // Leaving the reduction untiled on threadIdx.x makes it sequential on
- // threadIdx.x. After distribution, predication by `if (threadIdx.x == 0)` is
- // introduced and opportunities for distributing vector ops across warps
- // appear.
- %reduction_linalg_ops = transform.merge_handles %tiled_input_max,
- %tiled_exp_and_exps_sum,
- %tiled_exp_and_exps_sum_2
- : !transform.any_op
- transform.structured.tile_using_forall %reduction_linalg_ops tile_sizes [1, 1]
- ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- // Fully parallel ops are tiled and mapped.
- %parallel_linalg_ops = transform.merge_handles %tiled_input_max_fill,
- %tiled_exps_sum_fill,
- %tiled_div
- : !transform.any_op
- transform.structured.tile_using_forall %parallel_linalg_ops num_threads [1, 4, 32]
- ( mapping = [#gpu.thread<z>, #gpu.thread<y>, #gpu.thread<x>] )
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    // Step 4. Bufferize and drop HAL descriptor from memref ops.
+ // =========================================================
+ transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+ %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
+ %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- // Step 3. Rank-reduce and vectorize.
- // ==================================
- %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
+ // Step 5. Post-bufferization mapping to blocks and threads.
+ // =========================================================
+ %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ transform.iree.forall_to_workgroup %func_2 : (!transform.any_op) -> ()
+ transform.iree.map_nested_forall_to_gpu_threads %func_2 workgroup_dims = [32, 4, 1] : (!transform.any_op) -> ()
- // Step 4. Bufferize and drop HAL decriptor from memref ops.
- // =========================================================
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ // Step 6. Post-bufferization vector distribution with rank-reduction.
+ // ===================================================================
+ %end_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %end_func {
+ transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+ transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+ transform.apply_patterns.memref.fold_memref_alias_ops
+ transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+ } : !transform.any_op
+ %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op
+ transform.iree.vector.warp_distribute %end_func : (!transform.any_op) -> ()
- // Step 5. Post-bufferization mapping to blocks and threads.
- // =========================================================
- %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.iree.forall_to_workgroup %func_2 : (!transform.any_op) -> ()
- transform.iree.map_nested_forall_to_gpu_threads %func_2 workgroup_dims = [32, 4, 1] : (!transform.any_op) -> ()
-
- // Step 6. Post-bufferization vector distribution with rank-reduction.
- // ===================================================================
- %end_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %end_func {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.memref.fold_memref_alias_ops
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op
- transform.iree.vector.warp_distribute %end_func : (!transform.any_op) -> ()
-}
+ transform.yield
+ }
+} // module
diff --git a/tests/transform_dialect/cuda/softmax_partial.mlir b/tests/transform_dialect/cuda/softmax_partial.mlir
index 6f4ca42..91032cb 100644
--- a/tests/transform_dialect/cuda/softmax_partial.mlir
+++ b/tests/transform_dialect/cuda/softmax_partial.mlir
@@ -5,7 +5,8 @@
// RUN: --iree-stream-transformation-pipeline \
// RUN: --iree-hal-configuration-pipeline | \
// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/softmax_partial_codegen_spec.mlir \
+// RUN: --iree-codegen-transform-dialect-library=%p/softmax_partial_codegen_spec.mlir \
+// RUN: --iree-codegen-use-transform-dialect-strategy=codegen \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false | \
// RUN: FileCheck %s --check-prefix=CHECK-SHUFFLE
@@ -14,7 +15,8 @@
/// Constant JIT'ing must be disabled because the transform-dialect debug
/// flags leak to the JIT session, which doesn't know what to do with them.
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/softmax_partial_codegen_spec.mlir | \
+// RUN: --iree-codegen-transform-dialect-library=%p/softmax_partial_codegen_spec.mlir \
+// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \
// RUN: iree-run-module --module=- --function=softmax_partial --device=cuda | \
// RUN: FileCheck %s
diff --git a/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir
index 5f8175c..7c1564f 100644
--- a/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir
@@ -1,93 +1,97 @@
// RUN: iree-opt %s
// Codegen
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @codegen(
+ %variant_op: !transform.any_op {transform.consumed}) {
- // Step 1. First level of tiling + fusion parallelizes to blocks.
- // ==============================================================
- %root = transform.structured.match interface{LinalgOp}
- attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %red = transform.structured.match interface{LinalgOp}
- attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %not_root = merge_handles %fill, %red : !transform.any_op
- %tiled_generic, %forall =
- transform.structured.tile_using_forall %root tile_sizes [1, 4]
- ( mapping = [#gpu.block<x>, #gpu.block<y>] )
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> ()
- transform.structured.fuse_into_containing_op %not_root into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
- // Step 2. Second level of tiling + fusion parallelizes to threads.
- // ================================================================
- %fill_linalg = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %reduction_linalg = transform.structured.match ops{["linalg.generic"]}
- attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %parallel_linalg = transform.structured.match ops{["linalg.generic"]}
- attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %tiled_reduction_generic, %forall_reduction =
- transform.structured.tile_using_forall %reduction_linalg tile_sizes [1, 1]
- ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
+ // Step 1. First level of tiling + fusion parallelizes to blocks.
+ // ==============================================================
+ %root = transform.structured.match interface{LinalgOp}
+ attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %red = transform.structured.match interface{LinalgOp}
+ attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %not_root = transform.merge_handles %fill, %red : !transform.any_op
+ %tiled_generic, %forall =
+ transform.structured.tile_using_forall %root tile_sizes [1, 4]
+ ( mapping = [#gpu.block<x>, #gpu.block<y>] )
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- // TODO: this fusion currently does not happen properly, this is related to the clone
- // behavior when fusing into scf.forall.
- // Once fixed we'll be able to fuse.
- // Fusion will save us one roundtrip to memory.
- // transform.structured.fuse_into_containing_op %fill_linalg into %forall_reduction
- transform.structured.tile_using_forall %parallel_linalg num_threads [1, 4, 32]
- ( mapping = [#gpu.thread<z>, #gpu.thread<y>, #gpu.thread<x>] )
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> ()
+ transform.structured.fuse_into_containing_op %not_root into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // Step 2. Second level of tiling + fusion parallelizes to threads.
+ // ================================================================
+ %fill_linalg = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %reduction_linalg = transform.structured.match ops{["linalg.generic"]}
+ attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %parallel_linalg = transform.structured.match ops{["linalg.generic"]}
+ attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %tiled_reduction_generic, %forall_reduction =
+ transform.structured.tile_using_forall %reduction_linalg tile_sizes [1, 1]
+ ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ // TODO: this fusion currently does not happen properly, this is related to the clone
+ // behavior when fusing into scf.forall.
+ // Once fixed we'll be able to fuse.
+ // Fusion will save us one roundtrip to memory.
+ // transform.structured.fuse_into_containing_op %fill_linalg into %forall_reduction
+ transform.structured.tile_using_forall %parallel_linalg num_threads [1, 4, 32]
+ ( mapping = [#gpu.thread<z>, #gpu.thread<y>, #gpu.thread<x>] )
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- // Inability to tile reductions to scf.forall has 2 implications:
- // 1. since no scf.forall is present, no gpu.barrier is added.
- // This should be fixed independently: ops that are not nested in an scf.forall
- // should have a gpu.barrier. Later needs to be complemented by a barrier
- // removal pass.
- // 2. Similarly, needs to be predicated under an if threadIx == 0 to avoid
- // multiple threads updating the buffer inplace once bufferized.
- //
- // Instead, we can vectorize and go to vector SSA values that sidestep these
- // issues.
- // Everyone will race to the write while still computing the same value.
- //
- // That is still not good enough because we need to predicate this in order
- // to enable the parallel reduction on warps.
+ // Inability to tile reductions to scf.forall has 2 implications:
+ // 1. since no scf.forall is present, no gpu.barrier is added.
+ // This should be fixed independently: ops that are not nested in an scf.forall
+ // should have a gpu.barrier. Later needs to be complemented by a barrier
+ // removal pass.
+    //   2. Similarly, needs to be predicated under an if threadIdx.x == 0 to avoid
+ // multiple threads updating the buffer inplace once bufferized.
+ //
+ // Instead, we can vectorize and go to vector SSA values that sidestep these
+ // issues.
+ // Everyone will race to the write while still computing the same value.
+ //
+ // That is still not good enough because we need to predicate this in order
+ // to enable the parallel reduction on warps.
- // Step 3. Rank-reduce and vectorize.
- // ==================================
- %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
+ // Step 3. Rank-reduce and vectorize.
+ // ==================================
+ %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func {
+ transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+ transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+ transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+ } : !transform.any_op
+ transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
- // Step 4. Bufferize and drop HAL decriptor from memref ops.
- // =========================================================
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    // Step 4. Bufferize and drop HAL descriptor from memref ops.
+ // =========================================================
+ transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+ %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
+ %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- // Step 5. Post-bufferization mapping to blocks and threads.
- // =========================================================
- %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.iree.forall_to_workgroup %func_2 : (!transform.any_op) -> ()
- transform.iree.map_nested_forall_to_gpu_threads %func_2 workgroup_dims = [32, 4, 1] : (!transform.any_op) -> ()
+ // Step 5. Post-bufferization mapping to blocks and threads.
+ // =========================================================
+ %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ transform.iree.forall_to_workgroup %func_2 : (!transform.any_op) -> ()
+ transform.iree.map_nested_forall_to_gpu_threads %func_2 workgroup_dims = [32, 4, 1] : (!transform.any_op) -> ()
- // Step 6. Post-bufferization vector distribution with rank-reduction.
- // ===================================================================
- %end_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %end_func {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.memref.fold_memref_alias_ops
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
- : (!transform.any_op) -> !transform.any_op
- transform.iree.vector.warp_distribute %end_func : (!transform.any_op) -> ()
-}
+ // Step 6. Post-bufferization vector distribution with rank-reduction.
+ // ===================================================================
+ %end_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %end_func {
+ transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+ transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+ transform.apply_patterns.memref.fold_memref_alias_ops
+ transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+ } : !transform.any_op
+ %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
+ : (!transform.any_op) -> !transform.any_op
+ transform.iree.vector.warp_distribute %end_func : (!transform.any_op) -> ()
+
+ transform.yield
+ }
+} // module
diff --git a/tests/transform_dialect/cuda/softmax_v2.mlir b/tests/transform_dialect/cuda/softmax_v2.mlir
index 2a556d1..07e3c28 100644
--- a/tests/transform_dialect/cuda/softmax_v2.mlir
+++ b/tests/transform_dialect/cuda/softmax_v2.mlir
@@ -6,7 +6,8 @@
// RUN: --iree-hal-configuration-pipeline | \
// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/softmax_v2_codegen_spec.mlir | \
+// RUN: --iree-codegen-transform-dialect-library=%p/softmax_v2_codegen_spec.mlir \
+// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \
// RUN: FileCheck %s --check-prefix=CHECK-SHUFFLE
// RUN: iree-compile %s --iree-hal-target-backends=cuda \
@@ -15,7 +16,8 @@
/// flags leak to the JIT session, which doesn't know what to do with them.
// RUN: --iree-flow-fuse-multi-use \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/softmax_v2_codegen_spec.mlir | \
+// RUN: --iree-codegen-transform-dialect-library=%p/softmax_v2_codegen_spec.mlir \
+// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \
// RUN: iree-run-module --module=- --function=softmax --device=cuda | \
// RUN: FileCheck %s
diff --git a/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir
index dda89bf..67e3cb3 100644
--- a/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir
@@ -1,138 +1,143 @@
// RUN: iree-opt %s
// Codegen
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
- %ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
- in %variant_op : (!transform.any_op) -> !transform.any_op
- %input_max_fill,
- %input_max,
- %exps_sum_fill,
- %exp_and_exps_sum,
- %div = transform.split_handle %ops
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op,
- !transform.any_op, !transform.any_op)
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @codegen(
+ %variant_op: !transform.any_op {transform.consumed}) {
- // Step 1. First level of tiling + fusion parallelizes to blocks.
- // ==============================================================
- %_, %forall =
- transform.structured.tile_using_forall %div tile_sizes [1, 4]
- ( mapping = [#gpu.block<x>, #gpu.block<y>] )
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> ()
+ %ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
+ in %variant_op : (!transform.any_op) -> !transform.any_op
+ %input_max_fill,
+ %input_max,
+ %exps_sum_fill,
+ %exp_and_exps_sum,
+ %div = transform.split_handle %ops
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op,
+ !transform.any_op, !transform.any_op)
- // TODO: Merging and fusing merged handles does not work properly atm.
- transform.structured.fuse_into_containing_op %exp_and_exps_sum into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.structured.fuse_into_containing_op %exps_sum_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.structured.fuse_into_containing_op %input_max into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.structured.fuse_into_containing_op %input_max_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
- // By default, fusion into scf.forall does not promote captured values
- // to shared as this involves a cross-thread dependence analysis.
- // Instead, we activate it explicitly post-hoc to promote all the extract_slice
- // ops that we find and match the prerequisites
- %forall_with_type = transform.cast %forall : !transform.any_op to !transform.op<"scf.forall">
- transform.iree.share_forall_operands %forall_with_type
- : (!transform.op<"scf.forall">) -> !transform.op<"scf.forall">
+ // Step 1. First level of tiling + fusion parallelizes to blocks.
+ // ==============================================================
+ %_, %forall =
+ transform.structured.tile_using_forall %div tile_sizes [1, 4]
+ ( mapping = [#gpu.block<x>, #gpu.block<y>] )
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> ()
- // Canonicalizations.
- %func_op = transform.structured.match ops{["func.func"]} in %variant_op
+ // TODO: Merging and fusing merged handles does not work properly atm.
+ transform.structured.fuse_into_containing_op %exp_and_exps_sum into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.structured.fuse_into_containing_op %exps_sum_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.structured.fuse_into_containing_op %input_max into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.structured.fuse_into_containing_op %input_max_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ // By default, fusion into scf.forall does not promote captured values
+ // to shared as this involves a cross-thread dependence analysis.
+ // Instead, we activate it explicitly post-hoc to promote all the extract_slice
+ // ops that we find and match the prerequisites
+ %forall_with_type = transform.cast %forall : !transform.any_op to !transform.op<"scf.forall">
+ transform.iree.share_forall_operands %forall_with_type
+ : (!transform.op<"scf.forall">) -> !transform.op<"scf.forall">
+
+ // Canonicalizations.
+ %func_op = transform.structured.match ops{["func.func"]} in %variant_op
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func_op {
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_licm %func_op : !transform.any_op
+ transform.iree.apply_cse %func_op : !transform.any_op
+
+
+ // Step 2. Second level of tiling + fusion parallelizes to threads.
+ // ================================================================
+ %tiled_ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
+ in %variant_op : (!transform.any_op) -> !transform.any_op
+ %tiled_input_max_fill,
+ %tiled_input_max,
+ %tiled_exps_sum_fill,
+ %tiled_exp_and_exps_sum,
+ %tiled_div = transform.split_handle %tiled_ops
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op,
+ !transform.any_op, !transform.any_op)
+ // Leaving the reduction untiled on threadIdx.x makes it sequential on
+ // threadIdx.x. After distribution, predication by `if (threadIdx.x == 0)` is
+ // introduced and opportunities for distributing vector ops across warps
+ // appear.
+ %reduction_linalg_ops = transform.merge_handles %tiled_input_max,
+ %tiled_exp_and_exps_sum
+ : !transform.any_op
+ transform.structured.tile_using_forall %reduction_linalg_ops tile_sizes [1, 1]
+ ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ // Fully parallel ops are tiled and mapped.
+ %parallel_linalg_ops = transform.merge_handles %tiled_input_max_fill,
+ %tiled_exps_sum_fill,
+ %tiled_div
+ : !transform.any_op
+ transform.structured.tile_using_forall %parallel_linalg_ops num_threads [1, 4, 32]
+ ( mapping = [#gpu.thread<z>, #gpu.thread<y>, #gpu.thread<x>] )
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // Canonicalizations.
+ transform.apply_patterns to %func_op {
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_licm %func_op : !transform.any_op
+ transform.iree.apply_cse %func_op : !transform.any_op
+
+ // Step 3. Rank-reduce and vectorize.
+ // ==================================
+ transform.apply_patterns to %func_op {
+ transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+ transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+ transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+ } : !transform.any_op
+ transform.structured.vectorize_children_and_apply_patterns %func_op : (!transform.any_op) -> !transform.any_op
+
+      // Step 4. Bufferize and drop HAL descriptor from memref ops.
+ // =========================================================
+ transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+ %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
+ %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+
+ // Step 5. Post-bufferization mapping to blocks and threads.
+ // =========================================================
+ transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
+ transform.iree.map_nested_forall_to_gpu_threads %memref_func
+ workgroup_dims = [32, 4, 1] : (!transform.any_op) -> ()
+
+ // Step 6. Post-bufferization vector distribution with rank-reduction.
+ // ===================================================================
+ transform.apply_patterns to %memref_func {
+ transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+ transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+ transform.apply_patterns.memref.fold_memref_alias_ops
+ transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+ } : !transform.any_op
+ %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
: (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func_op {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_licm %func_op : !transform.any_op
- transform.iree.apply_cse %func_op : !transform.any_op
-
-
- // Step 2. Second level of tiling + fusion parallelizes to threads.
- // ================================================================
- %tiled_ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
- in %variant_op : (!transform.any_op) -> !transform.any_op
- %tiled_input_max_fill,
- %tiled_input_max,
- %tiled_exps_sum_fill,
- %tiled_exp_and_exps_sum,
- %tiled_div = transform.split_handle %tiled_ops
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op,
- !transform.any_op, !transform.any_op)
- // Leaving the reduction untiled on threadIdx.x makes it sequential on
- // threadIdx.x. After distribution, predication by `if (threadIdx.x == 0)` is
- // introduced and opportunities for distributing vector ops across warps
- // appear.
- %reduction_linalg_ops = transform.merge_handles %tiled_input_max,
- %tiled_exp_and_exps_sum
- : !transform.any_op
- transform.structured.tile_using_forall %reduction_linalg_ops tile_sizes [1, 1]
- ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- // Fully parallel ops are tiled and mapped.
- %parallel_linalg_ops = transform.merge_handles %tiled_input_max_fill,
- %tiled_exps_sum_fill,
- %tiled_div
- : !transform.any_op
- transform.structured.tile_using_forall %parallel_linalg_ops num_threads [1, 4, 32]
- ( mapping = [#gpu.thread<z>, #gpu.thread<y>, #gpu.thread<x>] )
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
- // Canonicalizations.
- transform.apply_patterns to %func_op {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_licm %func_op : !transform.any_op
- transform.iree.apply_cse %func_op : !transform.any_op
-
- // Step 3. Rank-reduce and vectorize.
- // ==================================
- transform.apply_patterns to %func_op {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- transform.structured.vectorize_children_and_apply_patterns %func_op : (!transform.any_op) -> !transform.any_op
-
- // Step 4. Bufferize and drop HAL decriptor from memref ops.
- // =========================================================
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-
- // Step 5. Post-bufferization mapping to blocks and threads.
- // =========================================================
- transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
- transform.iree.map_nested_forall_to_gpu_threads %memref_func
- workgroup_dims = [32, 4, 1] : (!transform.any_op) -> ()
-
- // Step 6. Post-bufferization vector distribution with rank-reduction.
- // ===================================================================
- transform.apply_patterns to %memref_func {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.memref.fold_memref_alias_ops
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
- : (!transform.any_op) -> !transform.any_op
- %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
- : (!transform.any_op) -> !transform.any_op
- transform.iree.vector.warp_distribute %memref_func
- : (!transform.any_op) -> ()
-
-
- // Late canonicalizations.
- %func_op_3 = transform.structured.match ops{["func.func"]} in %variant_op_3
+ %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
: (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func_op_3 {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_licm %func_op_3 : !transform.any_op
- transform.iree.apply_cse %func_op_3 : !transform.any_op
-}
+ transform.iree.vector.warp_distribute %memref_func
+ : (!transform.any_op) -> ()
+
+
+ // Late canonicalizations.
+ %func_op_3 = transform.structured.match ops{["func.func"]} in %variant_op_3
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func_op_3 {
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_licm %func_op_3 : !transform.any_op
+ transform.iree.apply_cse %func_op_3 : !transform.any_op
+
+ transform.yield
+ }
+} // module
diff --git a/tests/transform_dialect/cuda/vecadd2d.mlir b/tests/transform_dialect/cuda/vecadd2d.mlir
deleted file mode 100644
index 7e03154..0000000
--- a/tests/transform_dialect/cuda/vecadd2d.mlir
+++ /dev/null
@@ -1,84 +0,0 @@
-!type = tensor<9x512xf32>
-!type2 = tensor<512x9xf32>
-
-#trait = { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"] }
-
-#trait2 = { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
- affine_map<(d0, d1) -> (d1, d0)>],
- iterator_types = ["parallel", "parallel"] }
-
-util.global private @"lhs" {noinline} = dense<0.0> : !type2
-util.global private @"rhs" {noinline} = dense<2.0> : !type
-
-func.func @vecadd2d() -> (!type2) {
- %cst0 = arith.constant 0.000000e+00 : f32
- %cst1 = arith.constant 2.000000e+00 : f32
-
- %x_ptr = util.global.address @"rhs" : !util.ptr<!type>
- %x = util.global.load.indirect %x_ptr : !util.ptr<!type> -> !type
- %y_ptr = util.global.address @"lhs" : !util.ptr<!type2>
- %y = util.global.load.indirect %y_ptr : !util.ptr<!type2> -> !type2
-
- // Note: Two linalg.generics to fill the tensors will make IREE generate two
- // separate kernels for the above and the below. It is important to validate
- // the results.
- %2 = linalg.generic #trait2 ins(%x : !type) outs(%y : !type2) {
- ^bb0(%arg3: f32, %arg4: f32):
- %3 = arith.addf %arg3, %arg4 : f32
- linalg.yield %3 : f32
- } -> !type2
-
- return %2 : !type2
-}
-
-// RUN: iree-opt %s --iree-hal-target-backends=cuda \
-// RUN: --iree-abi-transformation-pipeline \
-// RUN: --iree-flow-transformation-pipeline \
-// RUN: --iree-stream-transformation-pipeline \
-// RUN: --iree-hal-configuration-pipeline | \
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
-// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/vecadd2d_codegen_spec.mlir | \
-// RUN: FileCheck %s --check-prefix=CHECK
-
-// RUN: iree-opt %s --iree-hal-target-backends=cuda \
-// RUN: --iree-abi-transformation-pipeline \
-// RUN: --iree-flow-transformation-pipeline \
-// RUN: --iree-stream-transformation-pipeline \
-// RUN: --iree-hal-configuration-pipeline | \
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
-// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/vecadd2d_codegen_spec_partial_tile.mlir | \
-// RUN: FileCheck %s --check-prefix=CHECK-PARTIAL-TILE
-
-// RUN: iree-compile %s --iree-hal-target-backends=cuda \
-// RUN: --iree-opt-const-expr-hoisting=false --iree-opt-const-eval=false \
-/// Constant JIT'ing must be disabled because the transform-dialect debug
-/// flags leak to the JIT session, which doesn't know what to do with them.
-// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/vecadd2d_codegen_spec.mlir | \
-// RUN: iree-run-module --module=- --function=vecadd2d --device=cuda |\
-// RUN: FileCheck %s --check-prefix=EXEC
-
-// CHECK: hal.executable.export
-// CHECK: bb0(%[[DEV:.*]]: !hal.device):
-// CHECK: %[[C171:.*]] = arith.constant 171 : index
-// CHECK: %[[C1:.*]] = arith.constant 1 : index
-// CHECK: %[[C2:.*]] = arith.constant 2 : index
-// CHECK: hal.return %[[C171]], %[[C1]], %[[C2]] : index, index, index
-
-// CHECK: %[[BLKZ:.*]] = hal.interface.workgroup.id[2] : index
-// CHECK: %[[BLKX:.*]] = hal.interface.workgroup.id[0] : index
-// CHECK: memref.subview %0[%[[BLKZ:.*]], %[[BLKX:.*]]]
-
-// CHECK-PARTIAL-TILE: hal.executable.export
-// CHECK-PARTIAL-TILE: bb0(%[[DEV:.*]]: !hal.device):
-// CHECK-PARTIAL-TILE: %[[C1:.*]] = arith.constant 1 : index
-// CHECK-PARTIAL-TILE: %[[C1_2:.*]] = arith.constant 1 : index
-// CHECK-PARTIAL-TILE: %[[C171:.*]] = arith.constant 171 : index
-// CHECK-PARTIAL-TILE: hal.return %[[C1]], %[[C1_2]], %[[C171]] : index, index, index
-
-// EXEC: EXEC @vecadd2d
-// EXEC: result[0]: hal.buffer_view
-// EXEC: 512x9xf32=[2 2 2 2 2 2 2 2 2][2 2 2 2 2 2 2 2 2]
diff --git a/tests/transform_dialect/cuda/vecadd2d_codegen_spec.mlir b/tests/transform_dialect/cuda/vecadd2d_codegen_spec.mlir
deleted file mode 100644
index 2f94296..0000000
--- a/tests/transform_dialect/cuda/vecadd2d_codegen_spec.mlir
+++ /dev/null
@@ -1,27 +0,0 @@
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
- // Step 1. Find three linalg.generics and tile to GPU thread blocks.
- // ===========================================================================
- %generics = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %_, %forall_grid = transform.structured.tile_using_forall %generics
- tile_sizes [5, 3] ( mapping = [#gpu.block<z>, #gpu.block<x>])
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
-
-
- // Step 2. Rank reduce and bufferize and drop HAL decriptor from memref ops.
- // ===========================================================================
- %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-
- // Step 3. Map to GPU thread blocks.
- // ===========================================================================
- transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
-}
diff --git a/tests/transform_dialect/cuda/vecadd2d_codegen_spec_partial_tile.mlir b/tests/transform_dialect/cuda/vecadd2d_codegen_spec_partial_tile.mlir
deleted file mode 100644
index fc373cb..0000000
--- a/tests/transform_dialect/cuda/vecadd2d_codegen_spec_partial_tile.mlir
+++ /dev/null
@@ -1,24 +0,0 @@
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
- %generics = transform.structured.match ops{["linalg.generic"]} in %variant_op
- : (!transform.any_op) -> !transform.any_op
- // Tile only one dimension, skip the other one.
- %_, %forall_grid = transform.structured.tile_using_forall %generics
- tile_sizes [0, 3] ( mapping = [#gpu.block<z>])
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
-
-
- // Late canonicalizations to cleanup and pass the checks.
- // Needs to occur on the whole variant to perform cse on the workgroup_count region
- %func_op = transform.structured.match ops{["func.func"]} in %variant_op
- : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func_op {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_licm %func_op : !transform.any_op
- transform.iree.apply_cse %func_op : !transform.any_op
-}