Update the usage of the transform dialect interpreter (#15340)

Upstream is shifting to using the simpler form of the transform dialect
interpreter introduced in
https://github.com/llvm/llvm-project/pull/68661

Update IREE's codegen usage of the interpreter as well as tests which
now require a named sequence entry point.
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
index 7acdd4f..53d8c78 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
@@ -6,8 +6,8 @@
 
 # Tests for common transforms.
 
-load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
 load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
 
 package(
     features = ["layering_check"],
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_workgroup_swizzle.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_workgroup_swizzle.mlir
index fbed641..1982183 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_workgroup_swizzle.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_workgroup_swizzle.mlir
@@ -28,11 +28,15 @@
   return
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.iree.workgroup_swizzle %0 { log_tile = 3 } : (!transform.any_op) -> ()
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(
+      %variant_op: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    transform.iree.workgroup_swizzle %0 { log_tile = 3 } : (!transform.any_op) -> ()
+    transform.yield 
+  }
+} // module
+
 
 //    CHECK-LABEL: func.func @matmul
 //          CHECK: %[[WORKGROUPIDX:.*]] = hal.interface.workgroup.id[0] : index
diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp
index 918b43a..88e9a15 100644
--- a/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp
@@ -4,29 +4,35 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+#include <iterator>
 #include "iree/compiler/Codegen/Common/PassDetail.h"
 #include "iree/compiler/Codegen/Common/Passes.h"
 #include "iree/compiler/Codegen/Common/UserConfig.h"
 #include "iree/compiler/Codegen/Dialect/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/IREECodegenDialect.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/iterator_range.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/Transform/IR/TransformDialect.h"
 #include "mlir/Dialect/Transform/IR/TransformOps.h"
 #include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
-#define DEBUG_TYPE "iree-codegen-materialize-library-calls"
+#define DEBUG_TYPE "iree-codegen-materialize-user-configs"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
 
 namespace mlir {
 namespace iree_compiler {
 
-llvm::cl::opt<std::string> clCodegenTransformDialectTestName(
+llvm::cl::opt<std::string> clCodegenTransformDialectStrategyName(
     "iree-codegen-use-transform-dialect-strategy",
     llvm::cl::desc(
-        "Broadcasts the given transform dialect strategy specification to all"
-        "dispatches. Supports two modes; a path to the MLIR file containing a"
-        "transform dialect specification to apply, and a symbol reference to"
-        "load from a library of transform specs (@library_call)"),
+        "Broadcasts the given transform dialect strategy specification to all "
+        "dispatches. The specification is a symbol reference to load from a "
+        "library of transform specs (@library_call)"),
     llvm::cl::init(""));
 
 llvm::cl::opt<std::string> clCodegenTransformDialectLibraryFileName(
@@ -40,25 +46,6 @@
 
 static const char kTranslationInfoAttrName[] = "translation_info";
 
-static void createEmptyTransformStrategy(ModuleOp innerModule) {
-  Location loc = innerModule.getLoc();
-  OpBuilder b = OpBuilder::atBlockEnd(innerModule.getBody());
-  auto topLevelTransformModule = b.create<ModuleOp>(loc);
-  Region &topLevelTransformRegion = topLevelTransformModule.getBodyRegion();
-  b.setInsertionPointToStart(&topLevelTransformRegion.front());
-  auto anyOpType = transform::AnyOpType::get(b.getContext());
-
-  // Create the include for the named sequence with the expectation that the
-  // external definition will be linked in later.
-  auto sequence = b.create<transform::SequenceOp>(
-      loc, TypeRange{}, transform::FailurePropagationMode::Propagate, anyOpType,
-      [&](OpBuilder &b, Location loc, Value variantH) {
-        b.create<transform::PrintOp>(loc, variantH);
-        b.create<transform::YieldOp>(loc);
-      });
-  (void)sequence;
-}
-
 struct MaterializeUserConfigsPass
     : public MaterializeUserConfigsBase<MaterializeUserConfigsPass> {
   void getDependentDialects(DialectRegistry &registry) const override {
@@ -72,6 +59,7 @@
         getAllEntryPoints(moduleOp);
     MLIRContext *context = moduleOp.getContext();
 
+    LDBG("MaterializeUserConfigsPass on variant: " << variantOp);
     std::optional<ModuleOp> transformLibrary = std::nullopt;
     if (!clCodegenTransformDialectLibraryFileName.empty()) {
       auto dialect =
@@ -79,9 +67,13 @@
       auto maybeTransformLibrary = dialect->getOrLoadTransformLibraryModule(
           clCodegenTransformDialectLibraryFileName);
       if (failed(maybeTransformLibrary)) {
+        variantOp.emitError() << "failed to load transform library module: "
+                              << clCodegenTransformDialectLibraryFileName;
         return signalPassFailure();
       }
       transformLibrary = *maybeTransformLibrary;
+      LDBG("--found transform library @"
+           << clCodegenTransformDialectLibraryFileName);
     }
 
     IREE::Codegen::DispatchLoweringPassPipeline tdPipeline =
@@ -89,19 +81,27 @@
     std::optional<IREE::Codegen::TranslationInfoAttr> clTranslationInfo;
     // Here we always set the pipeline strategy to transform dialect if the
     // flag is non-empty to ensure we pick the right lowering pipeline in the
-    // event a file path is given.
-    if (!clCodegenTransformDialectTestName.empty()) {
+    // event a strategy symbol is defined.
+    if (!clCodegenTransformDialectLibraryFileName.empty() ||
+        !clCodegenTransformDialectStrategyName.empty()) {
+      StringRef strategyName =
+          (clCodegenTransformDialectStrategyName.empty())
+              ? StringRef(
+                    transform::TransformDialect::kTransformEntryPointSymbolName)
+              : clCodegenTransformDialectStrategyName;
       clTranslationInfo = IREE::Codegen::TranslationInfoAttr::get(
           context, tdPipeline,
           /*softwarePipelineDepth=*/0,
           /*softwarePipelineStoreStage=*/1,
-          /*codegenSpec=*/clCodegenTransformDialectTestName[0] == '@'
-              ? SymbolRefAttr::get(
-                    context, llvm::StringRef(
-                                 clCodegenTransformDialectTestName.substr(1)))
-              : SymbolRefAttr());
+          /*codegenSpec=*/
+          SymbolRefAttr::get(context, llvm::StringRef(strategyName)));
+      LDBG("--clTranslationInfo: " << clTranslationInfo);
     }
 
+    LDBG("--start iterating over: "
+         << std::distance(moduleOp.getOps<func::FuncOp>().begin(),
+                          moduleOp.getOps<func::FuncOp>().end())
+         << " functions");
     std::optional<IREE::Codegen::TranslationInfoAttr> translationInfo;
     for (auto funcOp : moduleOp.getOps<func::FuncOp>()) {
       auto exportOp = exportOps.lookup(funcOp.getName());
@@ -131,7 +131,7 @@
           /// Currently codegen is rooted on the variant, meaning every entry
           /// must go through the same codegen pipeline. For multi-targeting we
           /// will want to have multiple functions per variant, as well as
-          /// multple exports per variant, meaning eventually the nesting of
+          /// multiple exports per variant, meaning eventually the nesting of
           /// the translation pipeline will need to change to the function, or
           /// we'll need another level of module op nesting.
           if (exportedTranslationInfo != translationInfo.value()) {
@@ -160,6 +160,7 @@
       }
     }
 
+    LDBG("--guaranteed unique translationInfo: " << translationInfo);
     /// We only need to resolve symbols for transform dialect based strategies.
     if (!translationInfo ||
         translationInfo.value().getDispatchLoweringPassPipeline() !=
@@ -167,52 +168,38 @@
       return;
     }
 
-    std::optional<SymbolRefAttr> libraryFunc =
+    // From now on, we know we have a transform dialect strategy. We now need to
+    // ensure it can resolve and apply in a subsequent interpreter pass or else
+    // we need to fall back to codegen.
+    bool failedToResolve = false;
+    auto g = llvm::make_scope_exit([&]() {
+      if (!failedToResolve)
+        return;
+
+      exportOps = getAllEntryPoints(variantOp.getInnerModule());
+      for (auto &it : exportOps) {
+        auto exportOp = it.second;
+        if (getTranslationInfo(exportOp) == translationInfo) {
+          exportOp->removeAttr(kTranslationInfoAttrName);
+        }
+      }
+    });
+
+    std::optional<SymbolRefAttr> strategyName =
         translationInfo.value().getCodegenSpec();
-    if (!libraryFunc || *libraryFunc == SymbolRefAttr()) {
+    if (!strategyName || *strategyName == SymbolRefAttr()) {
+      failedToResolve = true;
       return;
     }
 
     /// If we have a symbol, verify the existence of the symbol within the
     /// transform library.
+    StringRef entryPoint = strategyName->getLeafReference();
     if (!transformLibrary || !(*transformLibrary) ||
         !transform::detail::findTransformEntryPoint(
-            variantOp, *transformLibrary, libraryFunc->getLeafReference())) {
+            variantOp, *transformLibrary, entryPoint)) {
       moduleOp.emitOpError("failed to find transform strategy symbol");
-      return signalPassFailure();
-    }
-
-    // TODO: At this point we could allow the user to (optionally) return a
-    // translation info attribute to use, however there currently isn't a way
-    // upstream to retrieve the results of the named sequence.
-
-    /// Attempt to execute the strategy.  symbol (from the flag or otherwise) at
-    /// the same time. Because the strategy is rooted on the variant op, the
-    /// strategy can change the translation info on the exports if needed, else
-    /// back to default IREE codegen.
-    StringRef entryPoint = libraryFunc->getLeafReference();
-    Operation *transformRoot = transform::detail::findTransformEntryPoint(
-        variantOp, *transformLibrary, entryPoint);
-    if (!transformRoot) {
-      return;
-    }
-    if (failed(transform::applyTransformNamedSequence(
-            variantOp, transformRoot, *transformLibrary,
-            options.enableExpensiveChecks(true)))) {
-      return signalPassFailure();
-    }
-
-    // Re-retrieve the export ops and mark all exports with unchanged
-    // translation info as un-translated.
-    // TODO: Currently this is the only way to "fall back" to codegen. If the
-    // user wants to do all of codegen themselves they can set a `None`
-    // pipeline.
-    exportOps = getAllEntryPoints(variantOp.getInnerModule());
-    for (auto &it : exportOps) {
-      auto exportOp = it.second;
-      if (getTranslationInfo(exportOp) == translationInfo) {
-        exportOp->removeAttr(kTranslationInfoAttrName);
-      }
+      failedToResolve = true;
     }
   }
 
diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.h b/compiler/src/iree/compiler/Codegen/Common/Passes.h
index ec5ab2c..0db187d 100644
--- a/compiler/src/iree/compiler/Codegen/Common/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/Common/Passes.h
@@ -252,10 +252,7 @@
 
 /// Create an IREE-specific Transform dialect interpreter pass with all
 /// registrations necessary for IREE.
-std::unique_ptr<Pass> createTransformDialectInterpreterPass(
-    llvm::StringRef transformFileName = llvm::StringRef(),
-    llvm::StringRef debugPayloadRootTag = llvm::StringRef(),
-    llvm::StringRef debugTransformRootTag = llvm::StringRef());
+std::unique_ptr<Pass> createTransformDialectInterpreterPass();
 
 /// Pass to propagate type to avoid generating load/stores of illegal types.
 std::unique_ptr<OperationPass<func::FuncOp>> createTypePropagationPass();
diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.td b/compiler/src/iree/compiler/Codegen/Common/Passes.td
index 0aefa61..4ca6907 100644
--- a/compiler/src/iree/compiler/Codegen/Common/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/Common/Passes.td
@@ -453,35 +453,18 @@
   let summary = "Pass to apply transform dialect operations.";
   let constructor =
     "mlir::iree_compiler::createTransformDialectInterpreterPass()";
+  let description = [{
+    This pass runs the transform dialect interpreter and applies the named
+    sequence transformation specified by the provided name (defaults to
+    `TransformDialect::kTransformEntryPointSymbolName` (i.e. `__transform_main`)).
+  }];
   let options = [
-    Option<"transformFileName", "transform-file-name", "std::string",
-            /*default=*/"\"\"",
-            "Optional filename containing a transform dialect specification to "
-            "apply. If left empty, the IR is assumed to contain one top-level "
-            "transform dialect operation somewhere in the module.">,
-    ListOption<"transformLibraryPaths",
-           "transform-library-paths",
-           "std::string",
-           "If non-empty, the paths to files containing definitions of "
-           "external symbols referenced in the transform script. "
-           "These definitions will be used to replace declarations.">,
-    Option<"debugPayloadRootTag", "debug-payload-root-tag", "std::string",
-            /*default=*/"\"\"",
-            "Select the operation with 'transform.target_tag' attribute having "
-            "the given value as payload IR root. This allows user control on "
-            "what operation to transform in debug mode, without requiring "
-            "intimate knowledge of the IREE nested pass pipeline.\\n"
-            "If empty (normal operation mode), select the pass anchor "
-            "operation in the IREE pipeline, as the payload IR root.">,
-    Option<"debugTransformRootTag", "debug-transform-root-tag", "std::string",
-            /*default=*/"\"\"",
-            "Select the operation with 'transform.target_tag' attribute having "
-            "the given value as container IR for top-level transform ops. This "
-            "allows user control on what transformation to apply in debug "
-            "mode, without requiring intimate knowledge of the IREE nested "
-            "pass pipeline.\\n"
-            "If empty (normal operation mode), select the container of the "
-            "top-level transform op.">
+    Option<"entryPoint", "entry-point", "std::string",
+           /*default=*/[{"transform::TransformDialect::kTransformEntryPointSymbolName.str()"}],
+           "Name of the transform dialect entry point symbol to run.">,
+    Option<"libraryFileName", "library-file-name", "std::string",
+           /*default=*/[{""}], 
+           "File path to load a library of transform dialect strategies from.">,
   ];
 }
 
diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp b/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp
index 1b95c7d..237dc20 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp
@@ -6,6 +6,7 @@
 
 #include "iree/compiler/Codegen/Common/PassDetail.h"
 #include "iree/compiler/Codegen/Common/Passes.h"
+#include "iree/compiler/Codegen/Dialect/IREECodegenDialect.h"
 #include "mlir/Dialect/Transform/IR/TransformDialect.h"
 #include "mlir/Dialect/Transform/Transforms/TransformInterpreterPassBase.h"
 #include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h"
@@ -20,62 +21,66 @@
 /// This needs to be its own pass because the registration mechanism and ops
 /// available are different than for other interpreters.
 class TransformDialectInterpreterPass
-    : public mlir::transform::TransformInterpreterPassBase<
-          TransformDialectInterpreterPass,
-          iree_compiler::TransformDialectInterpreterBase> {
+    : public iree_compiler::TransformDialectInterpreterBase<
+          TransformDialectInterpreterPass> {
 public:
+  TransformDialectInterpreterPass(StringRef libraryFileName = StringRef(),
+                                  StringRef entryPoint = StringRef()) {
+    this->libraryFileName = libraryFileName.str();
+    this->entryPoint = entryPoint.str();
+  }
+
   void getDependentDialects(DialectRegistry &registry) const override {
     mlir::iree_compiler::registerTransformDialectTranslationDependentDialects(
         registry);
   }
 
-  // We don't register libraries here because we expect them to be pre-loaded
-  // much earlier on in the compiler pipeline.
-  TransformDialectInterpreterPass(
-      StringRef transformFileName = StringRef(),
-      StringRef debugPayloadRootTag = StringRef(),
-      StringRef debugTransformRootTag = StringRef()) {
-    this->transformFileName = transformFileName.str();
-    this->debugPayloadRootTag = debugPayloadRootTag.str();
-    this->debugTransformRootTag = debugTransformRootTag.str();
+  void runOnOperation() override {
+    MLIRContext *context = &getContext();
+    transform::TransformOptions options;
+    if (entryPoint.empty()) {
+      entryPoint =
+          transform::TransformDialect::kTransformEntryPointSymbolName.str();
+    }
+    auto dialect = context->getOrLoadDialect<
+        mlir::iree_compiler::IREE::Codegen::IREECodegenDialect>();
+    FailureOr<ModuleOp> maybeTransformLibrary;
+    if (!libraryFileName.empty()) {
+      maybeTransformLibrary =
+          dialect->getOrLoadTransformLibraryModule(libraryFileName);
+    }
+
+    Operation *payloadRoot = getOperation();
+    ModuleOp transformModule =
+        succeeded(maybeTransformLibrary) ? *maybeTransformLibrary : ModuleOp();
+    Operation *transformEntryPoint = transform::detail::findTransformEntryPoint(
+        getOperation(), transformModule, entryPoint);
+    if (!transformEntryPoint) {
+      Operation *transformModuleOrPayloadRoot =
+          transformModule ? transformModule : payloadRoot;
+      transformModuleOrPayloadRoot->emitError()
+          << "failed to find transform entry point '" << entryPoint << "'";
+      return signalPassFailure();
+    }
+    if (failed(transform::applyTransformNamedSequence(
+            payloadRoot, transformEntryPoint, transformModule,
+            options.enableExpensiveChecks(true))))
+      return signalPassFailure();
   }
-  TransformDialectInterpreterPass(const TransformDialectInterpreterPass &pass) =
-      default;
 };
 } // namespace
 
 namespace mlir {
 namespace iree_compiler {
 
-extern llvm::cl::opt<std::string> clCodegenTransformDialectTestName;
-static llvm::cl::opt<std::string> clCodegenTransformDialectDebugPayloadTag(
-    "iree-codegen-transform-dialect-debug-payload-tag",
-    llvm::cl::desc("tag attribute value for the transform dialect interpreter "
-                   "payload root operation"),
-    llvm::cl::init(""));
-static llvm::cl::opt<std::string> clCodegenTransformDialectDebugTransformTag(
-    "iree-codegen-transform-dialect-debug-transform-tag",
-    llvm::cl::desc(
-        "tag attribute value for the transform dialect transform op container"),
-    llvm::cl::init(""));
+extern llvm::cl::opt<std::string> clCodegenTransformDialectStrategyName;
+extern llvm::cl::opt<std::string> clCodegenTransformDialectLibraryFileName;
 
 /// Create a Transform dialect interpreter pass.
-std::unique_ptr<Pass>
-createTransformDialectInterpreterPass(llvm::StringRef transformFileName,
-                                      llvm::StringRef debugPayloadRootTag,
-                                      llvm::StringRef debugTransformRootTag) {
-  // If the strategy filename is prefixed with `@`, it refers to a library
-  // call.
-  std::string clFileName = !clCodegenTransformDialectTestName.empty() &&
-                                   clCodegenTransformDialectTestName[0] != '@'
-                               ? clCodegenTransformDialectTestName
-                               : std::string();
+std::unique_ptr<Pass> createTransformDialectInterpreterPass() {
   return std::make_unique<TransformDialectInterpreterPass>(
-      transformFileName.empty() ? clFileName : transformFileName,
-      debugPayloadRootTag.empty() ? clCodegenTransformDialectDebugPayloadTag
-                                  : debugPayloadRootTag,
-      debugTransformRootTag.empty() ? clCodegenTransformDialectDebugTransformTag
-                                    : debugTransformRootTag);
+      clCodegenTransformDialectLibraryFileName,
+      clCodegenTransformDialectStrategyName);
 }
 } // namespace iree_compiler
 } // namespace mlir
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel
index 82e5a1c..431e851 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel
@@ -60,7 +60,6 @@
             "test_partitionable_loops_interface.mlir",
             "tile_and_distribute_to_workgroups.mlir",
             "transform_buffer_opt.mlir",
-            "transform_dialect_apply_pattern_op.mlir",
             "transform_match_partial_reduction.mlir",
             "transform_ops_invalid.mlir",
             "transpose_canonicalization.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt
index 8b2e512..3ef3060 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt
@@ -56,7 +56,6 @@
     "test_partitionable_loops_interface.mlir"
     "tile_and_distribute_to_workgroups.mlir"
     "transform_buffer_opt.mlir"
-    "transform_dialect_apply_pattern_op.mlir"
     "transform_match_partial_reduction.mlir"
     "transform_ops_invalid.mlir"
     "transpose_canonicalization.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/batch_matmul_match_spec.mlir b/compiler/src/iree/compiler/Codegen/Common/test/batch_matmul_match_spec.mlir
index 302aabd..815e4f7 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/batch_matmul_match_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/batch_matmul_match_spec.mlir
@@ -1,9 +1,11 @@
 // RUN: iree-opt %s
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
-  transform.iree.register_match_callbacks
-  %0:2 = transform.iree.match_callback failures(propagate) "batch_matmul"(%arg0) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.iree.emit_remark "fill" at %0#0 : !transform.any_op
-  transform.iree.emit_remark "batch matmul" at %0#1 : !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+    transform.iree.register_match_callbacks
+    %0:2 = transform.iree.match_callback failures(propagate) "batch_matmul"(%root) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.iree.emit_remark "fill" at %0#0 : !transform.any_op
+    transform.iree.emit_remark "batch matmul" at %0#1 : !transform.any_op
+    transform.yield
+  } // @__transform_main
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/batch_matmuls.mlir b/compiler/src/iree/compiler/Codegen/Common/test/batch_matmuls.mlir
index bfeb9a6..2d91350 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/batch_matmuls.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/batch_matmuls.mlir
@@ -1,4 +1,7 @@
-// RUN: iree-opt %s --iree-transform-dialect-interpreter='transform-file-name=%p/batch_matmul_match_spec.mlir' --split-input-file --verify-diagnostics
+// RUN: iree-opt %s \
+// RUN: --iree-codegen-transform-dialect-library=%p/batch_matmul_match_spec.mlir \
+// RUN: --iree-transform-dialect-interpreter \
+// RUN: --split-input-file --verify-diagnostics
 
 !lhs = tensor<128x80x32xf32>
 !rhs = tensor<128x32x320xf32>
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/convolution_match_spec.mlir b/compiler/src/iree/compiler/Codegen/Common/test/convolution_match_spec.mlir
index 52ea94b..898b091 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/convolution_match_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/convolution_match_spec.mlir
@@ -1,14 +1,16 @@
 // RUN: iree-opt %s
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
-  transform.iree.register_match_callbacks
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+    transform.iree.register_match_callbacks
 
-  %fill, %convolution, %trailing =
-    transform.iree.match_callback failures(propagate) "convolution"(%arg0)
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+    %fill, %convolution, %trailing =
+      transform.iree.match_callback failures(propagate) "convolution"(%root)
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
 
-  transform.iree.emit_remark "fill" at %fill : !transform.any_op
-  transform.iree.emit_remark "convolution" at %convolution : !transform.any_op
-  transform.iree.emit_remark "trailing" at %trailing : !transform.any_op
-}
+    transform.iree.emit_remark "fill" at %fill : !transform.any_op
+    transform.iree.emit_remark "convolution" at %convolution : !transform.any_op
+    transform.iree.emit_remark "trailing" at %trailing : !transform.any_op
+    transform.yield
+  } // @__transform_main
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/convolutions.mlir b/compiler/src/iree/compiler/Codegen/Common/test/convolutions.mlir
index 5a724b2..be4bb2f 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/convolutions.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/convolutions.mlir
@@ -1,4 +1,8 @@
-// RUN: iree-opt %s --iree-transform-dialect-interpreter='transform-file-name=%p/convolution_match_spec.mlir' --split-input-file --verify-diagnostics
+// RUN: iree-opt %s \
+// RUN: --iree-codegen-transform-dialect-library=%p/convolution_match_spec.mlir \
+// RUN: --iree-transform-dialect-interpreter \
+// RUN: --split-input-file --verify-diagnostics
+
 
 !input_tensor_t = tensor<2x16x130x130xf32>
 !weight_tensor_t = tensor<32x16x3x3xf32>
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/reductions.mlir b/compiler/src/iree/compiler/Codegen/Common/test/reductions.mlir
index 289bb02..3459164 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/reductions.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/reductions.mlir
@@ -1,5 +1,5 @@
-// RUN: iree-opt %s --iree-transform-dialect-interpreter='transform-file-name=%p/reductions_codegen_spec.mlir' --split-input-file | FileCheck %s
-// RUN: iree-opt %s --iree-transform-dialect-interpreter='transform-file-name=%p/reductions_match_spec.mlir' --split-input-file --verify-diagnostics
+// RUN: iree-opt %s --iree-codegen-transform-dialect-library=%p/reductions_codegen_spec.mlir --iree-transform-dialect-interpreter --split-input-file | FileCheck %s
+// RUN: iree-opt %s --iree-codegen-transform-dialect-library=%p/reductions_match_spec.mlir --iree-transform-dialect-interpreter --split-input-file --verify-diagnostics
 
 // Check that the same transform script applies to reductions with optional
 // leading and trailing elementwise operations, potentially reordered
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/reductions_codegen_spec.mlir b/compiler/src/iree/compiler/Codegen/Common/test/reductions_codegen_spec.mlir
index 93e5e39..d2a67aa 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/reductions_codegen_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/reductions_codegen_spec.mlir
@@ -1,75 +1,77 @@
 // RUN: iree-opt %s
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
-  transform.iree.register_match_callbacks
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+    transform.iree.register_match_callbacks
 
-  %maybe_leading, %original_fill, %reduction, %maybe_trailing_0 =
-    transform.iree.match_callback failures(propagate) "reduction"(%arg0)
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-  
-  %_, %more_parallel_fill, %parallel_reduction, %combiner_op =
-    transform.structured.split_reduction %reduction { split_factor = 2, insert_split_dimension = 1 }
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %maybe_leading, %original_fill, %reduction, %maybe_trailing_0 =
+      transform.iree.match_callback failures(propagate) "reduction"(%root)
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+
+    %_, %more_parallel_fill, %parallel_reduction, %combiner_op =
+      transform.structured.split_reduction %reduction { split_factor = 2, insert_split_dimension = 1 }
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
 
-  // Step 1. Map to a single block by tiling with size 1 and fusing.
-  %fusion_root_1, %fusion_group_1 = transform.iree.take_first %maybe_trailing_0, %combiner_op
-    : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  %outer_tiled, %grid_loop = transform.structured.tile_using_forall %fusion_root_1 tile_sizes [1]
-    ( mapping = [#gpu.block<x>] )
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  
-  %func = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func {
-    transform.apply_patterns.iree.bubble_expand
-  } : !transform.any_op
+    // Step 1. Map to a single block by tiling with size 1 and fusing.
+    %fusion_root_1, %fusion_group_1 = transform.iree.take_first %maybe_trailing_0, %combiner_op
+      : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %outer_tiled, %grid_loop = transform.structured.tile_using_forall %fusion_root_1 tile_sizes [1]
+      ( mapping = [#gpu.block<x>] )
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.iree.bubble_expand
+    } : !transform.any_op
 
-  // Excessively eager canonicalization results in `fill`s being "fused" due to
-  // swapping with `extract_slice`, which confuses the fusion operation below.
-  // Wrap fusion into a non-canonicalized sequence.
-  %fused_2, %parallel_reduction_2, %more_parallel_fill_2, %original_fill_2, %maybe_leading_2 =
-    transform.sequence %arg0 : !transform.any_op -> !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op
-    failures(propagate) {
-  ^bb1(%arg1: !transform.any_op):
-    %fused_22, %new_containing_1 = transform.structured.fuse_into_containing_op %fusion_group_1 into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    %parallel_reduction_22, %new_containing_2 = transform.structured.fuse_into_containing_op %parallel_reduction into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    %more_parallel_fill_22, %new_containing_3 = transform.structured.fuse_into_containing_op %more_parallel_fill into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    %original_fill_22, %new_containing_4 = transform.structured.fuse_into_containing_op %original_fill into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    %maybe_leading_22, %new_containing_5 = transform.structured.fuse_into_containing_op %maybe_leading into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    // Excessively eager canonicalization results in `fill`s being "fused" due to
+    // swapping with `extract_slice`, which confuses the fusion operation below.
+    // Wrap fusion into a non-canonicalized sequence.
+    %fused_2, %parallel_reduction_2, %more_parallel_fill_2, %original_fill_2, %maybe_leading_2 =
+      transform.sequence %root : !transform.any_op -> !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op
+      failures(propagate) {
+    ^bb1(%arg1: !transform.any_op):
+      %fused_22, %new_containing_1 = transform.structured.fuse_into_containing_op %fusion_group_1 into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+      %parallel_reduction_22, %new_containing_2 = transform.structured.fuse_into_containing_op %parallel_reduction into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+      %more_parallel_fill_22, %new_containing_3 = transform.structured.fuse_into_containing_op %more_parallel_fill into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+      %original_fill_22, %new_containing_4 = transform.structured.fuse_into_containing_op %original_fill into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+      %maybe_leading_22, %new_containing_5 = transform.structured.fuse_into_containing_op %maybe_leading into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
 
-    transform.yield %fused_22, %parallel_reduction_22, %more_parallel_fill_22, %original_fill_22, %maybe_leading_22
-      : !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op
-  }
+      transform.yield %fused_22, %parallel_reduction_22, %more_parallel_fill_22, %original_fill_22, %maybe_leading_22
+        : !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op
+    }
 
-  // Step 2. Map reduction to thread X and parallel dimension to other threads.
-  // ===========================================================================
-  %fusion_group_22_full = transform.merge_handles %fused_2, %original_fill_2
-    : !transform.any_op
-  %fusion_root_22_tiled, %block_loop_22 =
-    transform.structured.tile_using_forall %outer_tiled
-    tile_sizes [1] ( mapping = [#gpu.thread<z>] )
-     : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.structured.fuse_into_containing_op %fusion_group_22_full into %block_loop_22 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  
+    // Step 2. Map reduction to thread X and parallel dimension to other threads.
+    // ===========================================================================
+    %fusion_group_22_full = transform.merge_handles %fused_2, %original_fill_2
+      : !transform.any_op
+    %fusion_root_22_tiled, %block_loop_22 =
+      transform.structured.tile_using_forall %outer_tiled
+      tile_sizes [1] ( mapping = [#gpu.thread<z>] )
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.structured.fuse_into_containing_op %fusion_group_22_full into %block_loop_22 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
 
-  %fusion_group_21 = transform.merge_handles %maybe_leading_2, %more_parallel_fill_2
-    : !transform.any_op
-  %fusion_root_21_tiled, %block_loop_21 =
-    transform.structured.tile_using_forall %parallel_reduction_2
-    tile_sizes [1, 1] ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.structured.fuse_into_containing_op %fusion_group_21 into %block_loop_21 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  
-  // Step 3. Rank-reduce.
-  // ===========================================================================
-  transform.apply_patterns to %func {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
+    %fusion_group_21 = transform.merge_handles %maybe_leading_2, %more_parallel_fill_2
+      : !transform.any_op
+    %fusion_root_21_tiled, %block_loop_21 =
+      transform.structured.tile_using_forall %parallel_reduction_2
+      tile_sizes [1, 1] ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.structured.fuse_into_containing_op %fusion_group_21 into %block_loop_21 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    // Step 3. Rank-reduce.
+    // ===========================================================================
+    transform.apply_patterns to %func {
+      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+    } : !transform.any_op
 
-  // We don't perform any following transformation (vectorization, bufferizaton,
-  // mapping) because this schedule is applied to Linalg-only code without the
-  // surrounding context and because it would make it difficult to detect, e.g.,
-  // lack of fusion.
-}
+    // We don't perform any following transformation (vectorization, bufferization,
+    // mapping) because this schedule is applied to Linalg-only code without the
+    // surrounding context and because it would make it difficult to detect, e.g.,
+    // lack of fusion.
+    transform.yield
+  } // @__transform_main
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/reductions_match_spec.mlir b/compiler/src/iree/compiler/Codegen/Common/test/reductions_match_spec.mlir
index 3de0a24..7f19631 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/reductions_match_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/reductions_match_spec.mlir
@@ -1,15 +1,17 @@
 // RUN: iree-opt %s
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
-  transform.iree.register_match_callbacks
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+    transform.iree.register_match_callbacks
 
-  %leading, %fill, %reduction, %trailing =
-    transform.iree.match_callback failures(propagate) "reduction"(%arg0)
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %leading, %fill, %reduction, %trailing =
+      transform.iree.match_callback failures(propagate) "reduction"(%root)
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
 
-  transform.iree.emit_remark "leading" at %leading : !transform.any_op
-  transform.iree.emit_remark "fill" at %fill : !transform.any_op
-  transform.iree.emit_remark "reduction" at %reduction : !transform.any_op
-  transform.iree.emit_remark "trailing" at %trailing : !transform.any_op
-}
+    transform.iree.emit_remark "leading" at %leading : !transform.any_op
+    transform.iree.emit_remark "fill" at %fill : !transform.any_op
+    transform.iree.emit_remark "reduction" at %reduction : !transform.any_op
+    transform.iree.emit_remark "trailing" at %trailing : !transform.any_op
+    transform.yield
+  } // @__transform_main
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/repeated_matcher_use.mlir b/compiler/src/iree/compiler/Codegen/Common/test/repeated_matcher_use.mlir
index 783ad3d..3193304 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/repeated_matcher_use.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/repeated_matcher_use.mlir
@@ -1,239 +1,241 @@
-// RUN: iree-opt %s --iree-transform-dialect-interpreter --verify-diagnostics --split-input-file
+// RUN: iree-opt %s \
+// RUN: --iree-transform-dialect-interpreter \
+// RUN: --split-input-file --verify-diagnostics
 
-module {
-  transform.sequence failures(propagate) {
-  ^bb0(%arg0: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
     transform.iree.register_match_callbacks
 
     %first, %second =
-      transform.iree.match_callback failures(propagate) "_test_repeated_matcher_use_callback"(%arg0)
+      transform.iree.match_callback failures(propagate) "_test_repeated_matcher_use_callback"(%root)
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     transform.iree.emit_remark "first" at %first : !transform.any_op
     transform.iree.emit_remark "second" at %second : !transform.any_op
-  }
+    transform.yield
+  } // @__transform_main
+} // module
 
-  module {
-    func.func private @f1(f32) -> f32
-    func.func private @f2(f32, f32) -> f32
+module {
+  func.func private @f1(f32) -> f32
+  func.func private @f2(f32, f32) -> f32
 
-    func.func @foo() -> tensor<10xf32> {
-      %dummy1 = tensor.empty() : tensor<10xf32>
-      %dummy2 = tensor.empty() : tensor<10xf32>
-      %dummy3 = tensor.empty() : tensor<10xf32>
-      %c0 = arith.constant 0.0 : f32
-      %operand = linalg.fill ins(%c0 : f32) outs(%dummy1 : tensor<10xf32>) -> tensor<10xf32>
-      
-      // expected-remark @below {{first}}
-      %first = linalg.generic {
-        indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
-        iterator_types = ["parallel"]
-      } ins(%operand : tensor<10xf32>)
-        outs(%dummy2 : tensor<10xf32>) {
-      ^bb0(%arg0: f32, %arg1: f32):
-        %0 = func.call @f1(%arg0) : (f32) -> f32
-        linalg.yield %0 : f32
-      } -> tensor<10xf32>
+  func.func @foo() -> tensor<10xf32> {
+    %dummy1 = tensor.empty() : tensor<10xf32>
+    %dummy2 = tensor.empty() : tensor<10xf32>
+    %dummy3 = tensor.empty() : tensor<10xf32>
+    %c0 = arith.constant 0.0 : f32
+    %operand = linalg.fill ins(%c0 : f32) outs(%dummy1 : tensor<10xf32>) -> tensor<10xf32>
+
+    // expected-remark @below {{first}}
+    %first = linalg.generic {
+      indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+      iterator_types = ["parallel"]
+    } ins(%operand : tensor<10xf32>)
+      outs(%dummy2 : tensor<10xf32>) {
+    ^bb0(%arg0: f32, %arg1: f32):
+      %0 = func.call @f1(%arg0) : (f32) -> f32
+      linalg.yield %0 : f32
+    } -> tensor<10xf32>
 
-      // expected-remark @below {{second}}
-      %second = linalg.generic {
-        indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
-        iterator_types = ["parallel"]
-      } ins(%operand, %first : tensor<10xf32>, tensor<10xf32>)
-        outs(%dummy3 : tensor<10xf32>) {
-      ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
-        %0 = func.call @f2(%arg0, %arg1) : (f32, f32) -> f32
-        linalg.yield %0 : f32
-      } -> tensor<10xf32>
-      return %second : tensor<10xf32>
-    }
+    // expected-remark @below {{second}}
+    %second = linalg.generic {
+      indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+      iterator_types = ["parallel"]
+    } ins(%operand, %first : tensor<10xf32>, tensor<10xf32>)
+      outs(%dummy3 : tensor<10xf32>) {
+    ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
+      %0 = func.call @f2(%arg0, %arg1) : (f32, f32) -> f32
+      linalg.yield %0 : f32
+    } -> tensor<10xf32>
+    return %second : tensor<10xf32>
   }
 }
 
 // -----
 
-module {
-  transform.sequence failures(propagate) {
-  ^bb0(%arg0: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
     transform.iree.register_match_callbacks
 
     // expected-error @+2 {{failed to match}}
     %first, %second =
-      transform.iree.match_callback failures(propagate) "_test_repeated_matcher_use_callback"(%arg0)
+      transform.iree.match_callback failures(propagate) "_test_repeated_matcher_use_callback"(%root)
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     transform.iree.emit_remark "first" at %first : !transform.any_op
     transform.iree.emit_remark "second" at %second : !transform.any_op
-  }
+    transform.yield
+  } // @__transform_main
+} // module
 
-  module {
-    func.func private @f1(f32) -> f32
-    func.func private @f2(f32, f32) -> f32
+module {
+  func.func private @f1(f32) -> f32
+  func.func private @f2(f32, f32) -> f32
 
-    func.func @foo() -> tensor<10xf32> {
-      %dummy1 = tensor.empty() : tensor<10xf32>
-      %dummy2 = tensor.empty() : tensor<10xf32>
-      %dummy3 = tensor.empty() : tensor<10xf32>
-      %dummy5 = tensor.empty() : tensor<10xf32>
-      %c0 = arith.constant 0.0 : f32
-      %c5 = arith.constant 5.0 : f32
-      %operand5 = linalg.fill ins(%c5 : f32) outs(%dummy5 : tensor<10xf32>) -> tensor<10xf32>
-      %operand = linalg.fill ins(%c0 : f32) outs(%dummy1 : tensor<10xf32>) -> tensor<10xf32>
-      
-      %first = linalg.generic {
-        indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
-        iterator_types = ["parallel"]
-      } ins(%operand : tensor<10xf32>)
-        outs(%dummy2 : tensor<10xf32>) {
-      ^bb0(%arg0: f32, %arg1: f32):
-        %0 = func.call @f1(%arg0) : (f32) -> f32
-        linalg.yield %0 : f32
-      } -> tensor<10xf32>
+  func.func @foo() -> tensor<10xf32> {
+    %dummy1 = tensor.empty() : tensor<10xf32>
+    %dummy2 = tensor.empty() : tensor<10xf32>
+    %dummy3 = tensor.empty() : tensor<10xf32>
+    %dummy5 = tensor.empty() : tensor<10xf32>
+    %c0 = arith.constant 0.0 : f32
+    %c5 = arith.constant 5.0 : f32
+    %operand5 = linalg.fill ins(%c5 : f32) outs(%dummy5 : tensor<10xf32>) -> tensor<10xf32>
+    %operand = linalg.fill ins(%c0 : f32) outs(%dummy1 : tensor<10xf32>) -> tensor<10xf32>
+
+    %first = linalg.generic {
+      indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+      iterator_types = ["parallel"]
+    } ins(%operand : tensor<10xf32>)
+      outs(%dummy2 : tensor<10xf32>) {
+    ^bb0(%arg0: f32, %arg1: f32):
+      %0 = func.call @f1(%arg0) : (f32) -> f32
+      linalg.yield %0 : f32
+    } -> tensor<10xf32>
 
-      %second = linalg.generic {
-        indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
-        iterator_types = ["parallel"]
-      } ins(%operand5, %first : tensor<10xf32>, tensor<10xf32>)
-        outs(%dummy3 : tensor<10xf32>) {
-      ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
-        %0 = func.call @f2(%arg0, %arg1) : (f32, f32) -> f32
-        linalg.yield %0 : f32
-      } -> tensor<10xf32>
-      return %second : tensor<10xf32>
-    }
+    %second = linalg.generic {
+      indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+      iterator_types = ["parallel"]
+    } ins(%operand5, %first : tensor<10xf32>, tensor<10xf32>)
+      outs(%dummy3 : tensor<10xf32>) {
+    ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
+      %0 = func.call @f2(%arg0, %arg1) : (f32, f32) -> f32
+      linalg.yield %0 : f32
+    } -> tensor<10xf32>
+    return %second : tensor<10xf32>
   }
 }
 
 // -----
 
-module {
-  transform.sequence failures(propagate) {
-  ^bb0(%arg0: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
     transform.iree.register_match_callbacks
 
     // expected-error @+2 {{failed to match}}
     %first, %second =
-      transform.iree.match_callback failures(propagate) "_test_repeated_matcher_use_callback"(%arg0)
+      transform.iree.match_callback failures(propagate) "_test_repeated_matcher_use_callback"(%root)
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     transform.iree.emit_remark "first" at %first : !transform.any_op
     transform.iree.emit_remark "second" at %second : !transform.any_op
-  }
+    transform.yield
+  } // @__transform_main
+} // module
 
-  module {
-    func.func private @f1(f32) -> f32
-    func.func private @f2(f32, f32) -> f32
+module {
+  func.func private @f1(f32) -> f32
+  func.func private @f2(f32, f32) -> f32
 
-    func.func @foo() -> tensor<10xf32> {
-      %dummy1 = tensor.empty() : tensor<10xf32>
-      %dummy2 = tensor.empty() : tensor<10xf32>
-      %dummy3 = tensor.empty() : tensor<10xf32>
-      %dummy5 = tensor.empty() : tensor<10xf32>
-      %c0 = arith.constant 0.0 : f32
-      %operand = linalg.fill ins(%c0 : f32) outs(%dummy1 : tensor<10xf32>) -> tensor<10xf32>
-      
-      %first = linalg.generic {
-        indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
-        iterator_types = ["parallel"]
-      } ins(%operand : tensor<10xf32>)
-        outs(%dummy2 : tensor<10xf32>) {
-      ^bb0(%arg0: f32, %arg1: f32):
-        %0 = func.call @f1(%arg0) : (f32) -> f32
-        linalg.yield %0 : f32
-      } -> tensor<10xf32>
+  func.func @foo() -> tensor<10xf32> {
+    %dummy1 = tensor.empty() : tensor<10xf32>
+    %dummy2 = tensor.empty() : tensor<10xf32>
+    %dummy3 = tensor.empty() : tensor<10xf32>
+    %dummy5 = tensor.empty() : tensor<10xf32>
+    %c0 = arith.constant 0.0 : f32
+    %operand = linalg.fill ins(%c0 : f32) outs(%dummy1 : tensor<10xf32>) -> tensor<10xf32>
+
+    %first = linalg.generic {
+      indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+      iterator_types = ["parallel"]
+    } ins(%operand : tensor<10xf32>)
+      outs(%dummy2 : tensor<10xf32>) {
+    ^bb0(%arg0: f32, %arg1: f32):
+      %0 = func.call @f1(%arg0) : (f32) -> f32
+      linalg.yield %0 : f32
+    } -> tensor<10xf32>
 
-      %second = linalg.generic {
-        indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
-        iterator_types = ["parallel"]
-      } ins(%first, %first : tensor<10xf32>, tensor<10xf32>)
-        outs(%dummy3 : tensor<10xf32>) {
-      ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
-        %0 = func.call @f2(%arg0, %arg1) : (f32, f32) -> f32
-        linalg.yield %0 : f32
-      } -> tensor<10xf32>
-      return %second : tensor<10xf32>
-    }
+    %second = linalg.generic {
+      indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+      iterator_types = ["parallel"]
+    } ins(%first, %first : tensor<10xf32>, tensor<10xf32>)
+      outs(%dummy3 : tensor<10xf32>) {
+    ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
+      %0 = func.call @f2(%arg0, %arg1) : (f32, f32) -> f32
+      linalg.yield %0 : f32
+    } -> tensor<10xf32>
+    return %second : tensor<10xf32>
   }
 }
 
 // -----
 
-module {
-  transform.sequence failures(propagate) {
-  ^bb0(%arg0: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
     transform.iree.register_match_callbacks
 
     %first, %second =
-      transform.iree.match_callback failures(propagate) "_test_value_matcher_callback"(%arg0)
+      transform.iree.match_callback failures(propagate) "_test_value_matcher_callback"(%root)
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     transform.iree.emit_remark "first" at %first : !transform.any_op
     transform.iree.emit_remark "second" at %second : !transform.any_op
-  }
+    transform.yield
+  } // @__transform_main
+} // module
 
-  module {
-    func.func private @f1(f32) -> f32
-    func.func private @f2(f32, f32) -> f32
+module {
+  func.func private @f1(f32) -> f32
+  func.func private @f2(f32, f32) -> f32
 
-    func.func @foo() -> tensor<10xf32> {
-      %dummy1 = tensor.empty() : tensor<10xf32>
-      %dummy2 = tensor.empty() : tensor<10xf32>
-      %dummy3 = tensor.empty() : tensor<10xf32>
-      %operand = tensor.empty() : tensor<10xf32>
+  func.func @foo() -> tensor<10xf32> {
+    %dummy1 = tensor.empty() : tensor<10xf32>
+    %dummy2 = tensor.empty() : tensor<10xf32>
+    %dummy3 = tensor.empty() : tensor<10xf32>
+    %operand = tensor.empty() : tensor<10xf32>
 
-      // expected-remark @below {{first}}
-      %first = linalg.generic {
-        indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
-        iterator_types = ["parallel"]
-      } ins(%operand : tensor<10xf32>)
-        outs(%dummy2 : tensor<10xf32>) {
-      ^bb0(%arg0: f32, %arg1: f32):
-        %0 = func.call @f1(%arg0) : (f32) -> f32
-        linalg.yield %0 : f32
-      } -> tensor<10xf32>
+    // expected-remark @below {{first}}
+    %first = linalg.generic {
+      indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+      iterator_types = ["parallel"]
+    } ins(%operand : tensor<10xf32>)
+      outs(%dummy2 : tensor<10xf32>) {
+    ^bb0(%arg0: f32, %arg1: f32):
+      %0 = func.call @f1(%arg0) : (f32) -> f32
+      linalg.yield %0 : f32
+    } -> tensor<10xf32>
 
-      // expected-remark @below {{second}}
-      %second = linalg.generic {
-        indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
-        iterator_types = ["parallel"]
-      } ins(%operand, %first : tensor<10xf32>, tensor<10xf32>)
-        outs(%dummy3 : tensor<10xf32>) {
-      ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
-        %0 = func.call @f2(%arg0, %arg1) : (f32, f32) -> f32
-        linalg.yield %0 : f32
-      } -> tensor<10xf32>
-      return %second : tensor<10xf32>
-    }
+    // expected-remark @below {{second}}
+    %second = linalg.generic {
+      indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+      iterator_types = ["parallel"]
+    } ins(%operand, %first : tensor<10xf32>, tensor<10xf32>)
+      outs(%dummy3 : tensor<10xf32>) {
+    ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
+      %0 = func.call @f2(%arg0, %arg1) : (f32, f32) -> f32
+      linalg.yield %0 : f32
+    } -> tensor<10xf32>
+    return %second : tensor<10xf32>
   }
 }
 
 // -----
 
-module {
-  transform.sequence failures(propagate) {
-  ^bb0(%arg0: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
     transform.iree.register_match_callbacks
 
-    %0 = transform.iree.match_callback failures(propagate) "_test_shaped_value_matcher_callback"(%arg0)
+    %0 = transform.iree.match_callback failures(propagate) "_test_shaped_value_matcher_callback"(%root)
       : (!transform.any_op) -> !transform.any_op
     transform.iree.emit_remark "matched" at %0 : !transform.any_op
-  }
+    transform.yield
+  } // @__transform_main
+} // module
 
-  module {
-    func.func @foo(%arg0: tensor<42x10xf32>) -> tensor<10x42xf32> {
-      %init = tensor.empty() : tensor<10x42xf32>
-      // expected-remark @below {{rank: 2}}
-      // expected-remark @below {{dimensions: 10, 42}}
-      // expected-remark @below {{matched}}
-      %0 = linalg.generic {
-        indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>],
-        iterator_types = ["parallel", "parallel"]
-      } ins(%arg0: tensor<42x10xf32>) 
-       outs(%init: tensor<10x42xf32>) {
-      ^bb0(%arg1: f32, %arg2: f32):
-        linalg.yield %arg1 : f32
-      } -> tensor<10x42xf32>
-      return %0 : tensor<10x42xf32>
-    }
+module {
+  func.func @foo(%arg0: tensor<42x10xf32>) -> tensor<10x42xf32> {
+    %init = tensor.empty() : tensor<10x42xf32>
+    // expected-remark @below {{rank: 2}}
+    // expected-remark @below {{dimensions: 10, 42}}
+    // expected-remark @below {{matched}}
+    %0 = linalg.generic {
+      indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>],
+      iterator_types = ["parallel", "parallel"]
+    } ins(%arg0: tensor<42x10xf32>)
+      outs(%init: tensor<10x42xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+      linalg.yield %arg1 : f32
+    } -> tensor<10x42xf32>
+    return %0 : tensor<10x42xf32>
   }
 }
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/transform_buffer_opt.mlir b/compiler/src/iree/compiler/Codegen/Common/test/transform_buffer_opt.mlir
index a4e1fc4..082d851 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/transform_buffer_opt.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/transform_buffer_opt.mlir
@@ -15,8 +15,10 @@
   return %r : vector<4xf32>
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.memref.erase_dead_alloc_and_stores %0 : (!transform.any_op) -> ()
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
+    transform.memref.erase_dead_alloc_and_stores %0 : (!transform.any_op) -> ()
+    transform.yield
+  } // @__transform_main
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/transform_dialect_apply_pattern_op.mlir b/compiler/src/iree/compiler/Codegen/Common/test/transform_dialect_apply_pattern_op.mlir
deleted file mode 100644
index 684e863..0000000
--- a/compiler/src/iree/compiler/Codegen/Common/test/transform_dialect_apply_pattern_op.mlir
+++ /dev/null
@@ -1,261 +0,0 @@
-// RUN: iree-opt %s -iree-transform-dialect-interpreter -transform-dialect-drop-schedule --split-input-file | FileCheck %s
-
-// CHECK-LABEL: @select_cmp_eq_select
-//       CHECK:   return %arg1
-func.func @select_cmp_eq_select(%arg0: i64, %arg1: i64) -> i64 {
-  %0 = arith.cmpi eq, %arg0, %arg1 : i64
-  %1 = arith.select %0, %arg0, %arg1 : i64
-  return %1 : i64
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %0 {
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-}
-
-// -----
-
-#map0 = affine_map<(d0, d1) -> (d0, d1)>
-#map2 = affine_map<(d0) -> (d0 * 4)>
-
-// CHECK-LABEL: @promote
-func.func @promote() -> (tensor<16x128xf32>) {
-  %c0 = arith.constant 0 : index
-  %f0 = arith.constant 0.000000e+00 : f32
-  %c16 = arith.constant 16 : index
-  %c32 = arith.constant 32 : index
-
-  %empty = tensor.empty() : tensor<16x128xf32>
-  %filled = linalg.fill ins(%f0 : f32) outs(%empty : tensor<16x128xf32>) -> tensor<16x128xf32>
-
-  // CHECK: forall{{.*}}shared_outs(%[[ARG:.*]] =
-  // CHECK:   %[[A:.*]] = tensor.extract_slice %[[ARG]]
-  // CHECK:   %[[B:.*]] = tensor.extract_slice %[[ARG]]
-  // CHECK:   %[[C:.*]] = linalg.generic{{.*}}ins(%[[A]]{{.*}}outs(%[[B]]
-  %10 = scf.forall (%arg0, %arg1) in (%c16, %c32) shared_outs(%arg2 = %filled) -> (tensor<16x128xf32>) {
-    %11 = affine.apply #map2(%arg1)
-    %extracted_slice = tensor.extract_slice %filled[%arg0, %11] [1, 4] [1, 1] : tensor<16x128xf32> to tensor<1x4xf32>
-    %extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %11] [1, 4] [1, 1] : tensor<16x128xf32> to tensor<1x4xf32>
-    %13 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice : tensor<1x4xf32>) outs(%extracted_slice_2 : tensor<1x4xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %res = arith.addf %in, %in: f32
-      linalg.yield %res : f32
-    } -> tensor<1x4xf32>
-    scf.forall.in_parallel {
-      tensor.parallel_insert_slice %13 into %arg2[%arg0, %11] [1, 4] [1, 1] : tensor<1x4xf32> into tensor<16x128xf32>
-    }
-  }
-  return %10 : tensor<16x128xf32>
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["scf.forall"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  %1 = transform.cast %0 : !transform.any_op to !transform.op<"scf.forall">
-  transform.iree.share_forall_operands %1 share_operands = [0] : (!transform.op<"scf.forall">) -> !transform.op<"scf.forall">
-}
-
-// -----
-
-#map2 = affine_map<(d0, d1) -> (d0, d1)>
-
-func.func private @mutate(f32) -> f32
-
-// CHECK-LABEL: @bubble_up
-func.func @bubble_up(%arg0: tensor<32x64xf32>) -> tensor<32x2x32xf32> {
-  // Check that shape expansion precedes linalg.generic after the patterns were applied.
-  // CHECK: tensor.expand_shape
-  // CHECK: tensor.expand_shape
-  // CHECK: linalg.generic
-  %init = tensor.empty() : tensor<32x64xf32>
-  %result = linalg.generic {
-    indexing_maps = [#map2, #map2],
-    iterator_types = ["parallel", "parallel"]}
-  ins(%arg0: tensor<32x64xf32>) outs(%init: tensor<32x64xf32>) {
-  ^bb0(%arg1: f32, %arg2: f32):
-    %0 = func.call @mutate(%arg1) : (f32) -> f32
-    linalg.yield %0 : f32
-  } -> tensor<32x64xf32>
-  %out = tensor.expand_shape %result[[0], [1, 2]] : tensor<32x64xf32> into tensor<32x2x32xf32>
-  return %out : tensor<32x2x32xf32>
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %0 {
-    transform.apply_patterns.iree.bubble_expand
-  } : !transform.any_op
-}
-
-// -----
-
-// CHECK-LABEL: @pad_fill_to_fill
-func.func @pad_fill_to_fill(%arg0: tensor<31x62xf32>) -> tensor<32x64xf32> {
-  // Check that a pad of a fill with the same constant is replaced by a
-  // bigger fill.
-  // CHECK-DAG: %[[FILL_CST:.*]] = arith.constant 0.0{{0*e\+00}} : f32
-  // CHECK-DAG: %[[EMPTY:.*]] = tensor.empty() : tensor<32x64xf32>
-  // CHECK: %[[PADDED_FILL:.*]] = linalg.fill ins(%[[FILL_CST]] : f32) outs(%[[EMPTY]] : tensor<32x64xf32>) -> tensor<32x64xf32>
-  // CHECK: return %[[PADDED_FILL]]
-  %cst = arith.constant 0.0 : f32
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %fill = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<31x62xf32>) -> tensor<31x62xf32>
-  %padded = tensor.pad %fill low[%c0, %c0] high[%c1, %c2] {
-    ^bb0(%arg3: index, %arg4: index):
-      tensor.yield %cst : f32
-  } : tensor<31x62xf32> to tensor<32x64xf32>
-  return %padded : tensor<32x64xf32>
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %0 {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-  } : !transform.any_op
-}
-
-// -----
-
-// CHECK-LABEL: @pad_fill_different_ssa_value_but_same_cst
-func.func @pad_fill_different_ssa_value_but_same_cst(%arg0: tensor<31x62xf32>) -> tensor<32x64xf32> {
-  // Check that a pad of a fill with the same constant is replaced by a
-  // bigger fill even when the constant comes from different ssa value.
-  // CHECK-DAG: %[[FILL_CST:.*]] = arith.constant 0.0{{0*e\+00}} : f32
-  // CHECK-DAG: %[[EMPTY:.*]] = tensor.empty() : tensor<32x64xf32>
-  // CHECK: %[[PADDED_FILL:.*]] = linalg.fill ins(%[[FILL_CST]] : f32) outs(%[[EMPTY]] : tensor<32x64xf32>) -> tensor<32x64xf32>
-  // CHECK: return %[[PADDED_FILL]]
-  %cst = arith.constant 0.0 : f32
-  %cst2 = arith.constant 0.0 : f32
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  %fill = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<31x62xf32>) -> tensor<31x62xf32>
-  %padded = tensor.pad %fill low[%c0, %c0] high[%c1, %c2] {
-    ^bb0(%arg3: index, %arg4: index):
-      tensor.yield %cst2 : f32
-  } : tensor<31x62xf32> to tensor<32x64xf32>
-  return %padded : tensor<32x64xf32>
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %0 {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-  } : !transform.any_op
-}
-
-// -----
-
-// CHECK-LABEL: @pad_extract_fill_to_fill
-func.func @pad_extract_fill_to_fill(%arg0: tensor<31x62xf32>,
-    %size0 : index, %size1 : index,
-    %high0 : index, %high1 : index) -> tensor<32x64xf32> {
-  // Check that a pad of a fill with the same constant is replaced by a
-  // bigger fill even when the fill is hidden behind an extract_slice.
-  // CHECK-DAG: %[[FILL_CST:.*]] = arith.constant 0.0{{0*e\+00}} : f32
-  // CHECK-DAG: %[[EMPTY:.*]] = tensor.empty() : tensor<32x64xf32>
-  // CHECK: %[[PADDED_FILL:.*]] = linalg.fill ins(%[[FILL_CST]] : f32) outs(%[[EMPTY]] : tensor<32x64xf32>) -> tensor<32x64xf32>
-  // CHECK: return %[[PADDED_FILL]]
-  %cst = arith.constant 0.0 : f32
-  %cst2 = arith.constant 0.0 : f32
-  %c0 = arith.constant 0 : index
-  %fill = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<31x62xf32>) -> tensor<31x62xf32>
-  %extracted_slice = tensor.extract_slice %fill[0, 0] [%size0, %size1] [1, 1] : tensor<31x62xf32> to tensor<?x?xf32>
-  %padded = tensor.pad %extracted_slice low[%c0, %c0] high[%high0, %high1] {
-    ^bb0(%arg3: index, %arg4: index):
-      tensor.yield %cst2 : f32
-  } : tensor<?x?xf32> to tensor<32x64xf32>
-  return %padded : tensor<32x64xf32>
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %0 {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-  } : !transform.any_op
-}
-
-// -----
-
-// CHECK-LABEL: @pad_extract_extract_fill_to_fill
-func.func @pad_extract_extract_fill_to_fill(%arg0: tensor<31x62xf32>,
-    %size0a : index, %size1a : index,
-    %size0b : index, %size1b : index,
-    %high0 : index, %high1 : index) -> tensor<32x64xf32> {
-  // Check that a pad of a fill with the same constant is replaced by a
-  // bigger fill even when the fill is hidden behind a few `extract_slice`s.
-  // CHECK-DAG: %[[FILL_CST:.*]] = arith.constant 0.0{{0*e\+00}} : f32
-  // CHECK-DAG: %[[EMPTY:.*]] = tensor.empty() : tensor<32x64xf32>
-  // CHECK: %[[PADDED_FILL:.*]] = linalg.fill ins(%[[FILL_CST]] : f32) outs(%[[EMPTY]] : tensor<32x64xf32>) -> tensor<32x64xf32>
-  // CHECK: return %[[PADDED_FILL]]
-  %cst = arith.constant 0.0 : f32
-  %cst2 = arith.constant 0.0 : f32
-  %c0 = arith.constant 0 : index
-  %fill = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<31x62xf32>) -> tensor<31x62xf32>
-  %extracted_sliceA = tensor.extract_slice %fill[0, 0] [%size0a, %size1a] [1, 1] : tensor<31x62xf32> to tensor<?x?xf32>
-  %extracted_sliceB = tensor.extract_slice %extracted_sliceA[0, 0] [%size0b, %size1b] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-  %padded = tensor.pad %extracted_sliceB low[%c0, %c0] high[%high0, %high1] {
-    ^bb0(%arg3: index, %arg4: index):
-      tensor.yield %cst2 : f32
-  } : tensor<?x?xf32> to tensor<32x64xf32>
-  return %padded : tensor<32x64xf32>
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %0 {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-  } : !transform.any_op
-}
-
-// -----
-
-// CHECK-LABEL: @pad_extract_bigger_fill_to_fill
-func.func @pad_extract_bigger_fill_to_fill(%arg0: tensor<253x123xf32>,
-    %size0 : index, %size1 : index,
-    %high0 : index, %high1 : index) -> tensor<32x64xf32> {
-  // Check that a pad of a bigger fill with the same constant is replaced by a
-  // fill of the right size.
-  // CHECK-DAG: %[[FILL_CST:.*]] = arith.constant 0.0{{0*e\+00}} : f32
-  // CHECK-DAG: %[[EMPTY:.*]] = tensor.empty() : tensor<32x64xf32>
-  // CHECK: %[[PADDED_FILL:.*]] = linalg.fill ins(%[[FILL_CST]] : f32) outs(%[[EMPTY]] : tensor<32x64xf32>) -> tensor<32x64xf32>
-  // CHECK: return %[[PADDED_FILL]]
-  %cst = arith.constant 0.0 : f32
-  %cst2 = arith.constant 0.0 : f32
-  %c0 = arith.constant 0 : index
-  %fill = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<253x123xf32>) -> tensor<253x123xf32>
-  %extracted_slice = tensor.extract_slice %fill[0, 0] [%size0, %size1] [1, 1] : tensor<253x123xf32> to tensor<?x?xf32>
-  %padded = tensor.pad %extracted_slice low[%c0, %c0] high[%high0, %high1] {
-    ^bb0(%arg3: index, %arg4: index):
-      tensor.yield %cst2 : f32
-  } : tensor<?x?xf32> to tensor<32x64xf32>
-  return %padded : tensor<32x64xf32>
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %0 {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-  } : !transform.any_op
-}
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/transform_match_partial_reduction.mlir b/compiler/src/iree/compiler/Codegen/Common/test/transform_match_partial_reduction.mlir
index 61baa7f..3e4e546 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/transform_match_partial_reduction.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/transform_match_partial_reduction.mlir
@@ -24,20 +24,22 @@
   return %result, %fill2 : tensor<8xf32>, tensor<32xf32>
 }
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
-  transform.iree.register_match_callbacks
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+    transform.iree.register_match_callbacks
 
-  %leading, %fill, %reduction, %trailing =
-    transform.iree.match_callback failures(propagate) "reduction_partial"(%arg0)
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %leading, %fill, %reduction, %trailing =
+      transform.iree.match_callback failures(propagate) "reduction_partial"(%root)
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
 
-  transform.iree.emit_remark "leading" at %leading : !transform.any_op
-  transform.iree.emit_remark "fill" at %fill : !transform.any_op
-  transform.iree.emit_remark "reduction" at %reduction : !transform.any_op
-  transform.iree.emit_remark "trailing" at %trailing : !transform.any_op
+    transform.iree.emit_remark "leading" at %leading : !transform.any_op
+    transform.iree.emit_remark "fill" at %fill : !transform.any_op
+    transform.iree.emit_remark "reduction" at %reduction : !transform.any_op
+    transform.iree.emit_remark "trailing" at %trailing : !transform.any_op
 
-  // expected-error @below {{failed to match}}
-  transform.iree.match_callback failures(propagate) "reduction"(%arg0)
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-}
+    // expected-error @below {{failed to match}}
+    transform.iree.match_callback failures(propagate) "reduction"(%root)
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    transform.yield
+  } // @__transform_main
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/transform_ops_invalid.mlir b/compiler/src/iree/compiler/Codegen/Common/test/transform_ops_invalid.mlir
index 8a5deaf..e8be084 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/transform_ops_invalid.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/transform_ops_invalid.mlir
@@ -1,75 +1,70 @@
 // RUN: iree-opt %s --split-input-file --iree-transform-dialect-interpreter --verify-diagnostics 
 
-module {
-  transform.sequence failures(propagate) {
-  ^bb0(%arg0: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
     // expected-error @below {{match registry not available}}
     transform.iree.match_callback failures(suppress) "_test_match_callback"() : () -> ()
-  }
-}
+    transform.yield
+  } // @__transform_main
+} // module
+
 
 // -----
 
-module {
-  transform.sequence failures(propagate) {
-  ^bb0(%arg0: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
     transform.iree.register_match_callbacks
     // expected-error @below {{callback '_non_existing_name_' not found in the registry}}
     transform.iree.match_callback failures(suppress) "_non_existing_name_"() : () -> ()
-  }
-}
+    transform.yield
+  } // @__transform_main
+} // module
+
 
 // -----
 
-module {
-  transform.sequence failures(propagate) {
-  ^bb0(%arg0: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
     transform.iree.register_match_callbacks
     // expected-error @below {{callback produced a different number of handles than expected}}
-    transform.iree.match_callback failures(suppress) "_test_match_callback"(%arg0) : (!transform.any_op) -> ()
-  }
-}
+    transform.iree.match_callback failures(suppress) "_test_match_callback"(%root) : (!transform.any_op) -> ()
+    transform.yield
+  } // @__transform_main
+} // module
+
 
 // -----
 
 // Successful match.
-module {
-  transform.sequence failures(propagate) {
-  ^bb0(%arg0: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
     transform.iree.register_match_callbacks
-    transform.iree.match_callback failures(propagate) "_test_match_callback"(%arg0) : (!transform.any_op) -> (!transform.any_op)
-  }
-}
+    transform.iree.match_callback failures(propagate) "_test_match_callback"(%root) : (!transform.any_op) -> (!transform.any_op)
+    transform.yield
+  } // @__transform_main
+} // module
+
 
 // -----
 
-module attributes {test.iree_transform_do_not_match} {
-  transform.sequence failures(propagate) {
-  ^bb0(%arg0: !transform.any_op):
+module attributes { transform.with_named_sequence, test.iree_transform_do_not_match } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
     transform.iree.register_match_callbacks
     // expected-error @below {{failed to match}}
-    transform.iree.match_callback failures(propagate) "_test_match_callback"(%arg0) : (!transform.any_op) -> (!transform.any_op)
-  }
-}
+    transform.iree.match_callback failures(propagate) "_test_match_callback"(%root) : (!transform.any_op) -> (!transform.any_op)
+    transform.yield
+  } // @__transform_main
+} // module
+
 
 // -----
 
 // Failed to match, but the op silences such errors.
-module attributes {test.iree_transform_do_not_match} {
-  transform.sequence failures(propagate) {
-  ^bb0(%arg0: !transform.any_op):
+module attributes { transform.with_named_sequence, test.iree_transform_do_not_match } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
     transform.iree.register_match_callbacks
-    transform.iree.match_callback failures(suppress) "_test_match_callback"(%arg0) : (!transform.any_op) -> (!transform.any_op)
-  }
-}
+    transform.iree.match_callback failures(suppress) "_test_match_callback"(%root) : (!transform.any_op) -> (!transform.any_op)
+    transform.yield
+  } // @__transform_main
+} // module
 
-// -----
-
-// Failed to match, but the parent sequence silences all errors.
-module attributes {test.iree_transform_do_not_match} {
-  transform.sequence failures(suppress) {
-  ^bb0(%arg0: !transform.any_op):
-    transform.iree.register_match_callbacks
-    transform.iree.match_callback failures(propagate) "_test_match_callback"(%arg0) : (!transform.any_op) -> (!transform.any_op)
-  }
-}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
index a57d686..9784944 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
@@ -6,8 +6,8 @@
 
 # Tests for common transforms.
 
-load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
 load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
 
 package(
     features = ["layering_check"],
@@ -56,7 +56,6 @@
             "tile.mlir",
             "tile_and_fuse.mlir",
             "transform_dialect_bufferize.mlir",
-            "transform_dialect_iree_tile_to_forall.mlir",
             "transpose_avx2_lowering.mlir",
             "unfused_fma.mlir",
             "vector_contract_to_arm_asm.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
index cdc786f..2add8b7 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
@@ -51,7 +51,6 @@
     "tile.mlir"
     "tile_and_fuse.mlir"
     "transform_dialect_bufferize.mlir"
-    "transform_dialect_iree_tile_to_forall.mlir"
     "transpose_avx2_lowering.mlir"
     "unfused_fma.mlir"
     "vector_contract_to_arm_asm.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir
index 905690f..273543b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir
@@ -35,9 +35,11 @@
   }
 }
 
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
-  transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-  %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> (!transform.any_op)
-  %func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.consumed}) {
+    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+    %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> (!transform.any_op)
+    %func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_iree_tile_to_forall.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_iree_tile_to_forall.mlir
deleted file mode 100644
index ac1f307..0000000
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_iree_tile_to_forall.mlir
+++ /dev/null
@@ -1,168 +0,0 @@
-// RUN: iree-opt %s -iree-transform-dialect-interpreter -transform-dialect-drop-schedule --cse --split-input-file | FileCheck %s
-
-// Check that we can specify `num_threads` when lowering
-// `workgroup_count_from_slice` using
-// `transform.iree.populate_workgroup_count_region_using_num_threads_slice`
-
-
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>
-
-// Check that num_threads (32) is reflected in the map.
-// CHECK: #[[$NUM_THREADS_MAP:.*]] = affine_map<(d0) -> (d0 * 32)>
-
-hal.executable private @matmul_static_dispatch_0 {
-  hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
-
-    hal.executable.export public @matmul_static_dispatch_0_matmul_1024x4096x12345 ordinal(0) layout(#pipeline_layout) {
-    // Check that num_threads is reflected in the workgroup size.
-    // CHECK-LABEL: hal.executable.export public @matmul_static_dispatch_0_matmul_1024x4096x12345
-    // CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index
-    // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
-    // CHECK: hal.return %[[C32]], %[[C1]], %[[C1]] : index, index, index
-    ^bb0(%arg0: !hal.device):
-      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
-      hal.return %x, %y, %z : index, index, index
-    }
-
-    builtin.module {
-      func.func @matmul_static_dispatch_0_matmul_1024x4096x12345() {
-        // Check that the tiling matches num_threads.
-        // CHECK-LABEL: func.func @matmul_static_dispatch_0_matmul_1024x4096x12345
-        // CHECK: = scf.forall (%[[IV:.*]]) in (32) shared_outs(%{{.*}}) -> (tensor<1024x4096xf32>) {
-        // CHECK: %[[OFFSET:.*]] = affine.apply #[[$NUM_THREADS_MAP]](%[[IV]])
-        // CHECK: %extracted_slice = tensor.extract_slice %{{.*}}[%[[OFFSET]], 0] [32, 12345] [1, 1] : tensor<1024x12345xf32> to tensor<32x12345xf32>
-        %c0 = arith.constant 0 : index
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x12345xf32>>
-        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<12345x4096xf32>>
-        %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1024x4096xf32>>
-        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 12345], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x12345xf32>> -> tensor<1024x12345xf32>
-        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [12345, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<12345x4096xf32>> -> tensor<12345x4096xf32>
-        %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1024, 4096], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<1024x4096xf32>> -> tensor<1024x4096xf32>
-        %6 = linalg.matmul ins(%3, %4 : tensor<1024x12345xf32>, tensor<12345x4096xf32>) outs(%5 : tensor<1024x4096xf32>) -> tensor<1024x4096xf32>
-        flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1024, 4096], strides = [1, 1] : tensor<1024x4096xf32> -> !flow.dispatch.tensor<readwrite:tensor<1024x4096xf32>>
-        return
-      }
-    }
-  }
-}
-
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
-  %original_matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op
-    : (!transform.any_op) -> !transform.any_op
-
-  %matmul, %forall =
-    transform.structured.tile_using_forall %original_matmul num_threads [32]
-      ( mapping = [#gpu.block<x>] )
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-  // Late canonicalizations to cleanup and pass the checks.
-  // Needs to occur on the whole variant to perform cse on the workgroup_count region
-  transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall
-    : (!transform.any_op) -> ()
-}
-
-// -----
-
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>
-
-hal.executable private @matmul_static_dispatch_0 {
-  hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
-
-    hal.executable.export public @elementwise_out_of_order_block_id ordinal(0) layout(#pipeline_layout) {
-    // Check that num_threads is consistent with the specified mapping
-    // CHECK-LABEL: hal.executable.export public @elementwise_out_of_order_block_id
-
-    // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
-    // CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index
-    // CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index
-    // CHECK: hal.return %[[C3]], %[[C5]], %[[C8]] : index, index, index
-    ^bb0(%arg0: !hal.device):
-      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
-      hal.return %x, %y, %z : index, index, index
-    }
-
-    builtin.module {
-      func.func @elementwise_out_of_order_block_id() {
-        // CHECK-LABEL: func.func @elementwise_out_of_order_block_id
-        // CHECK: = scf.forall (%[[IV:.*]]) in (3, 5, 8) shared_outs(%{{.*}}) -> (tensor<3x5x8xf32>) {
-        // CHECK: } {mapping = [#gpu.block<x>, #gpu.block<y>, #gpu.block<z>]}
-        %c0 = arith.constant 0 : index
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<3x5x8xf32>>
-        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<3x5x8xf32>>
-        %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [3, 5, 8], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x5x8xf32>> -> tensor<3x5x8xf32>
-        %empty = tensor.empty() : tensor<3x5x8xf32>
-        %3 = linalg.generic {
-          indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
-                           affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
-          iterator_types = ["parallel", "parallel", "parallel"]}
-          ins(%2 : tensor<3x5x8xf32>) outs(%empty : tensor<3x5x8xf32>) {
-          ^bb0(%in: f32, %in_0: f32):
-            %4 = math.sqrt %in : f32
-            linalg.yield %4 : f32
-          } -> tensor<3x5x8xf32>
-        flow.dispatch.tensor.store %3, %1, offsets = [0, 0, 0], sizes = [3, 5, 8], strides = [1, 1, 1] : tensor<3x5x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<3x5x8xf32>>
-        return
-      }
-    }
-  }
-}
-
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
-  %1 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %tiled_op, %forall_op = transform.structured.tile_using_forall %1   num_threads [] tile_sizes [1, 1, 1](mapping = [#gpu.block<x>, #gpu.block<y>, #gpu.block<z>]): (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_op : (!transform.any_op) -> ()
-}
-
-// -----
-
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
-
-hal.executable private @matmul_static_dispatch_0 {
-  hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
-
-    hal.executable.export public @vecadd2d_dispatch_0_generic_9x512_f32 ordinal(0) layout(#pipeline_layout) {
-    // Check that num_threads is consistent with the specified mapping
-    // CHECK-LABEL: hal.executable.export public @vecadd2d_dispatch_0_generic_9x512_f32
-
-    // CHECK-DAG: %[[C171:.*]] = arith.constant 171 : index
-    // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
-    // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
-    // CHECK: hal.return %[[C171]], %[[C1]], %[[C2]] : index, index, index
-    ^bb0(%arg0: !hal.device):
-      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
-      hal.return %x, %y, %z : index, index, index
-    }
-
-    builtin.module {
-      func.func @vecadd2d_dispatch_0_generic_9x512_f32() {
-        %c18432 = arith.constant 18432 : index
-        %c0 = arith.constant 0 : index
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c18432) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<9x512xf32>>
-        %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x9xf32>>
-        %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<512x9xf32>>
-        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [9, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<9x512xf32>> -> tensor<9x512xf32>
-        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 9], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x9xf32>> -> tensor<512x9xf32>
-        %5 = tensor.empty() : tensor<512x9xf32>
-        %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1, d0)>], iterator_types = ["parallel", "parallel"]} ins(%3, %4 : tensor<9x512xf32>, tensor<512x9xf32>) outs(%5 : tensor<512x9xf32>) {
-        ^bb0(%in: f32, %in_0: f32, %out: f32):
-          %7 = arith.addf %in, %in_0 : f32
-          linalg.yield %7 : f32
-        } -> tensor<512x9xf32>
-        flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [512, 9], strides = [1, 1] : tensor<512x9xf32> -> !flow.dispatch.tensor<writeonly:tensor<512x9xf32>>
-        return
-      }
-    }
-  }
-}
-
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
-  %1 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %tiled_op, %forall_op = transform.structured.tile_using_forall %1   num_threads [] tile_sizes [5, 3](mapping = [#gpu.block<z>, #gpu.block<x>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_op : (!transform.any_op) -> ()
-}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
index 3d72d66..a05a27d 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
@@ -66,6 +66,7 @@
         # tensor_dialect_*_spec is a an MLIR file that specifies a
         # transformation, it needs to be included as data.
         exclude = [
+            "attention_transform_spec.mlir",
             "transform_dialect_codegen_bufferize_spec.mlir",
             "transform_dialect_codegen_foreach_to_gpu_spec.mlir",
             "transform_dialect_codegen_vector_distribution_spec.mlir",
@@ -74,6 +75,7 @@
     ),
     cfg = "//compiler:lit.cfg.py",
     data = [
+        "attention_transform_spec.mlir",
         "transform_dialect_codegen_bufferize_spec.mlir",
         "transform_dialect_codegen_foreach_to_gpu_spec.mlir",
         "transform_dialect_codegen_vector_distribution_spec.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
index 0997aa8..562dbad 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
@@ -61,6 +61,7 @@
     FileCheck
     iree-opt
   DATA
+    attention_transform_spec.mlir
     transform_dialect_codegen_bufferize_spec.mlir
     transform_dialect_codegen_foreach_to_gpu_spec.mlir
     transform_dialect_codegen_vector_distribution_spec.mlir
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir
index 313a3f1..840fa2a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir
@@ -1,6 +1,5 @@
-// RUN: iree-opt %s --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-select-lowering-strategy, iree-llvmgpu-lower-executable-target)))' \
-// RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%s | \
+// RUN: iree-opt %s --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-transform-dialect-interpreter)))' \
+// RUN:   --iree-codegen-transform-dialect-library=%p/attention_transform_spec.mlir| \
 // RUN: FileCheck --check-prefix=CHECK %s
 
 hal.executable @_attention_dispatch_0 {
@@ -29,133 +28,6 @@
   }
 }
 
-transform.sequence failures(propagate) {
-  ^bb0(%variant_op: !transform.any_op):
-
-  // Get attention op
-  // ==========================================
-  %attention = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-
-  // Tile and distribute to workgroups
-  // ==========================================
-  %tiled_attention, %forall_grid =
-  transform.structured.tile_using_forall %attention tile_sizes [1, 128]
-    ( mapping = [#gpu.block<x>, #gpu.block<y>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
-
-  // Tile batch dimensions of attention
-  // ==========================================
-  %attention2 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %batch_tiled_attn, %loop = transform.structured.tile_using_for %attention2 [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %top_level_func {
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_cse %top_level_func : !transform.any_op
-
-  // Promote query and output operands
-  // ==========================================
-  %attention3 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %promoted_attention, %alloc_a0, %alloc_a1 = transform.iree.promote_operands %attention3 [0, 3]
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-  // Tile and decompose attention
-  // ==========================================
-  %attention4 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %acc_fill, %max_fill, %sum_fill, %inner_loop,
-  %fill_op, %first_matmul, %reduce_max, %partial_softmax, %update, %reduce_sum, %reciprocal_sum, %softmax, %truncate, %scale_acc, %second_matmul, %last_truncate
-      = tile_and_decompose_attention %attention4 :
-     (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-
-  // Promote key and value operands
-  // ==========================================
-  %promoted_first_matmul, %alloc0 = transform.iree.promote_operands %first_matmul [1]
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  %promoted_second_matmul, %alloc1 = transform.iree.promote_operands %second_matmul [1]
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-  // Tile and fuse attention ops
-  // ==========================================
-  %tiled_matmul, %forall = transform.structured.tile_using_forall %promoted_second_matmul tile_sizes [32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-  %f0, %loop0 = transform.structured.fuse_into_containing_op %scale_acc into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  %f1, %loop1 = transform.structured.fuse_into_containing_op %truncate into %loop0 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  %f2, %loop2 = transform.structured.fuse_into_containing_op %softmax into %loop1 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  transform.iree.apply_cse %func : !transform.any_op
-
-  %f3, %loop3 = transform.structured.fuse_into_containing_op %reciprocal_sum into %loop2 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  %f4, %loop4 = transform.structured.fuse_into_containing_op %reduce_sum into %loop3 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.iree.apply_cse %func : !transform.any_op
-
-  %f5, %loop5 = transform.structured.fuse_into_containing_op %update into %loop4 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  %f6, %loop6 = transform.structured.fuse_into_containing_op %partial_softmax into %loop5 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.iree.apply_cse %func : !transform.any_op
-
-  %f7, %loop7 = transform.structured.fuse_into_containing_op %reduce_max into %loop6 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  %f8, %loop8 = transform.structured.fuse_into_containing_op %promoted_first_matmul into %loop7 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  %f9, %loop9 = transform.structured.fuse_into_containing_op %fill_op into %loop8 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.iree.apply_cse %func : !transform.any_op
-
-  // Distribute fills and last truncate
-  // ==========================================
-  %fills = transform.merge_handles %acc_fill, %max_fill, %sum_fill, %last_truncate : !transform.any_op
-  %tiled_fill, %fill_grid = transform.structured.tile_using_forall %fills tile_sizes[32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-  // Vectorize function
-  // ==========================================
-  transform.apply_patterns to %func {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
-  %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> (!transform.any_op)
-
-  // Bufferization
-  // ==========================================
-  transform.apply_patterns to %func_3 {
-     transform.apply_patterns.tensor.reassociative_reshape_folding
-     transform.apply_patterns.canonicalization
-     transform.apply_patterns.iree.fold_fill_into_pad
-     transform.apply_patterns.linalg.tiling_canonicalization
-     transform.apply_patterns.scf.for_loop_canonicalization
-  } : !transform.any_op
-  transform.iree.apply_cse %func_3 : !transform.any_op
-  transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-  transform.apply_patterns to %func_3 { transform.apply_patterns.linalg.erase_unnecessary_inputs } : !transform.any_op
-  %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
-
-  // Step 5. Pre-process the contract and transfer ops to put it in the right form.
-  // ===========================================================================
-  %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func_2 {
-    transform.apply_patterns.iree.prepare_vector_to_mma
-  } : !transform.any_op
-
-  // Step 6. Post-bufferization vector distribution
-  // ===========================================================================
-  %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
-  transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 4] subgroup_size = 32 : (!transform.any_op) -> ()
-
-  transform.apply_patterns to %func_7 {
-     transform.apply_patterns.memref.fold_memref_alias_ops
-  } : !transform.any_op
-  transform.iree.apply_licm %func_7 : !transform.any_op
-  transform.apply_patterns to %func_7 {
-     transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_cse %func_7 : !transform.any_op
-  %func_8 = transform.structured.hoist_redundant_vector_transfers %func_7
-  : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func_8 {
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_cse %func_8 : !transform.any_op
-  transform.memref.erase_dead_alloc_and_stores %func_8 : (!transform.any_op) -> ()
-}
-
 // CHECK-DAG:  #[[MAP:.+]] = affine_map<()[s0] -> (s0 * 128)>
 // CHECK-DAG:  #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 // CHECK-DAG:  #[[MAP2:.+]] = affine_map<()[s0, s1, s2] -> (s2 * 32 + ((s0 + s1 * 4) floordiv 32) * 32)>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_transform_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_transform_spec.mlir
new file mode 100644
index 0000000..ae62eb4
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_transform_spec.mlir
@@ -0,0 +1,128 @@
+
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.consumed}) {
+    // Get attention op
+    // ==========================================
+    %attention = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+
+    // Tile and distribute to workgroups
+    // ==========================================
+    %tiled_attention, %forall_grid =
+    transform.structured.tile_using_forall %attention tile_sizes [1, 128]
+      ( mapping = [#gpu.block<x>, #gpu.block<y>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
+
+    // Tile batch dimensions of attention
+    // ==========================================
+    %attention2 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %batch_tiled_attn, %loop = transform.structured.tile_using_for %attention2 [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %top_level_func {
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.iree.apply_cse %top_level_func : !transform.any_op
+
+    // Promote query and output operands
+    // ==========================================
+    %attention3 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %promoted_attention, %alloc_a0, %alloc_a1 = transform.iree.promote_operands %attention3 [0, 3]
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+    // Tile and decompose attention
+    // ==========================================
+    %attention4 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %acc_fill, %max_fill, %sum_fill, %inner_loop,
+    %fill_op, %first_matmul, %reduce_max, %partial_softmax, %update, %reduce_sum, %reciprocal_sum, %softmax, %truncate, %scale_acc, %second_matmul, %last_truncate
+        = transform.tile_and_decompose_attention %attention4 :
+      (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+
+    // Promote key and value operands
+    // ==========================================
+    %promoted_first_matmul, %alloc0 = transform.iree.promote_operands %first_matmul [1]
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %promoted_second_matmul, %alloc1 = transform.iree.promote_operands %second_matmul [1]
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    // Tile and fuse attention ops
+    // ==========================================
+    %tiled_matmul, %forall = transform.structured.tile_using_forall %promoted_second_matmul tile_sizes [32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    %f0, %loop0 = transform.structured.fuse_into_containing_op %scale_acc into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %f1, %loop1 = transform.structured.fuse_into_containing_op %truncate into %loop0 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %f2, %loop2 = transform.structured.fuse_into_containing_op %softmax into %loop1 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    transform.iree.apply_cse %func : !transform.any_op
+
+    %f3, %loop3 = transform.structured.fuse_into_containing_op %reciprocal_sum into %loop2 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %f4, %loop4 = transform.structured.fuse_into_containing_op %reduce_sum into %loop3 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.iree.apply_cse %func : !transform.any_op
+
+    %f5, %loop5 = transform.structured.fuse_into_containing_op %update into %loop4 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %f6, %loop6 = transform.structured.fuse_into_containing_op %partial_softmax into %loop5 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.iree.apply_cse %func : !transform.any_op
+
+    %f7, %loop7 = transform.structured.fuse_into_containing_op %reduce_max into %loop6 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %f8, %loop8 = transform.structured.fuse_into_containing_op %promoted_first_matmul into %loop7 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %f9, %loop9 = transform.structured.fuse_into_containing_op %fill_op into %loop8 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.iree.apply_cse %func : !transform.any_op
+
+    // Distribute fills and last truncate
+    // ==========================================
+    %fills = transform.merge_handles %acc_fill, %max_fill, %sum_fill, %last_truncate : !transform.any_op
+    %tiled_fill, %fill_grid = transform.structured.tile_using_forall %fills tile_sizes[32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    // Vectorize function
+    // ==========================================
+    transform.apply_patterns to %func {
+      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+    } : !transform.any_op
+    %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> (!transform.any_op)
+
+    // Bufferization
+    // ==========================================
+    transform.apply_patterns to %func_3 {
+      transform.apply_patterns.tensor.reassociative_reshape_folding
+      transform.apply_patterns.canonicalization
+      transform.apply_patterns.iree.fold_fill_into_pad
+      transform.apply_patterns.linalg.tiling_canonicalization
+      transform.apply_patterns.scf.for_loop_canonicalization
+    } : !transform.any_op
+    transform.iree.apply_cse %func_3 : !transform.any_op
+    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+    transform.apply_patterns to %func_3 { transform.apply_patterns.linalg.erase_unnecessary_inputs } : !transform.any_op
+    %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
+
+    // Step 5. Pre-process the contract and transfer ops to put it in the right form.
+    // ===========================================================================
+    %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func_2 {
+      transform.apply_patterns.iree.prepare_vector_to_mma
+    } : !transform.any_op
+
+    // Step 6. Post-bufferization vector distribution
+    // ===========================================================================
+    %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
+    transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 4] subgroup_size = 32 : (!transform.any_op) -> ()
+
+    transform.apply_patterns to %func_7 {
+      transform.apply_patterns.memref.fold_memref_alias_ops
+    } : !transform.any_op
+    transform.iree.apply_licm %func_7 : !transform.any_op
+    transform.apply_patterns to %func_7 {
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.iree.apply_cse %func_7 : !transform.any_op
+    %func_8 = transform.structured.hoist_redundant_vector_transfers %func_7
+    : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func_8 {
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.iree.apply_cse %func_8 : !transform.any_op
+    transform.memref.erase_dead_alloc_and_stores %func_8 : (!transform.any_op) -> ()
+    transform.yield 
+  }
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/create_async_groups.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/create_async_groups.mlir
index a716a2f..1fc2285 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/create_async_groups.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/create_async_groups.mlir
@@ -21,13 +21,15 @@
     // CHECK: nvgpu.device_async_wait %[[G]]
     return
   }
+}
 
-  transform.sequence failures(propagate) {
-  ^bb1(%variant_op: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     transform.iree.create_async_groups %top_level_func {use_mma_sync} : (!transform.any_op) -> ()
+    transform.yield 
   }
-}
+} // module
 
 // -----
 
@@ -53,13 +55,15 @@
     // CHECK: nvgpu.device_async_wait %[[G]]
     return
   }
+}
 
-  transform.sequence failures(propagate) {
-  ^bb1(%variant_op: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     transform.iree.create_async_groups %top_level_func : (!transform.any_op) -> ()
+    transform.yield 
   }
-}
+} // module
 
 // -----
 
@@ -80,15 +84,17 @@
     vector.transfer_write %2, %0[%c0, %c4, %c0] {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
     return
   }
+}
 
-  transform.sequence failures(propagate) {
-  ^bb1(%variant_op: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     %vector_transfer = transform.structured.match ops{["memref.alloc"]} in %top_level_func : (!transform.any_op) -> !transform.any_op
     // expected-error@below {{transform applied to the wrong op kind}}
     transform.iree.create_async_groups %vector_transfer : (!transform.any_op) -> ()
+    transform.yield 
   }
-}
+} // module
 
 // -----
 
@@ -112,13 +118,15 @@
     // CHECK: nvgpu.device_async_wait %[[G]]
     return
   }
+}
 
-  transform.sequence failures(propagate) {
-  ^bb1(%variant_op: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     transform.iree.create_async_groups %top_level_func : (!transform.any_op) -> ()
+    transform.yield 
   }
-}
+} // module
 
 // -----
 
@@ -148,10 +156,12 @@
     // CHECK-NOT: nvgpu.device_async_create_group
     return
   }
+}
 
-  transform.sequence failures(propagate) {
-  ^bb1(%variant_op: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     transform.iree.create_async_groups %top_level_func : (!transform.any_op) -> ()
+    transform.yield 
   }
-}
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/layout_analysis_and_distribution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/layout_analysis_and_distribution.mlir
index 689e1d2..bc34439 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/layout_analysis_and_distribution.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/layout_analysis_and_distribution.mlir
@@ -1,6 +1,6 @@
 // RUN: iree-opt %s -iree-transform-dialect-interpreter -transform-dialect-drop-schedule -cse -split-input-file --verify-diagnostics | FileCheck %s
 
-builtin.module {
+builtin.module attributes { transform.with_named_sequence } {
   func.func @matmul_dispatch_0_matmul_16x8x16() {
     %c0 = arith.constant 0 : index
     %cst = arith.constant dense<0.000000e+00> : vector<16x8xf16>
@@ -17,12 +17,14 @@
     vector.transfer_write %5, %2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<16x8xf16>
     return
   }
-  transform.sequence failures(propagate) {
-  ^bb1(%variant_op: !transform.any_op):
+
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     %transformed_func = transform.iree.layout_analysis_and_distribution %top_level_func : (!transform.any_op) -> (!transform.any_op)
+    transform.yield 
   }
-}
+} // module
+
 
 // CHECK-DAG:  #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d1 + d2 * 16)>
 // CHECK-DAG:  #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0 * 2)>
@@ -128,7 +130,7 @@
 
 // -----
 
-builtin.module {
+builtin.module attributes { transform.with_named_sequence } {
   func.func @matmul_reduction() {
     %c0 = arith.constant 0 : index
     %cst = arith.constant dense<0.000000e+00> : vector<16x8xf16>
@@ -149,12 +151,13 @@
     vector.transfer_write %8, %2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<16x8xf16>
     return
   }
-  transform.sequence failures(propagate) {
-  ^bb1(%variant_op: !transform.any_op):
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     %transformed_func = transform.iree.layout_analysis_and_distribution %top_level_func : (!transform.any_op) -> (!transform.any_op)
+    transform.yield 
   }
-}
+} // module
+
 
 // CHECK-DAG:  #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d1 + d2 * 16)>
 // CHECK-DAG:  #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0 * 2)>
@@ -318,7 +321,7 @@
 #map4 = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map5 = affine_map<(d0, d1, d2) -> (d1, d2)>
 #map6 = affine_map<(d0, d1, d2) -> (d0, d1)>
-builtin.module {
+builtin.module attributes { transform.with_named_sequence } {
   func.func @matmul_scf() {
     %cst = arith.constant 0.000000e+00 : f16
     %c0 = arith.constant 0 : index
@@ -348,12 +351,13 @@
     vector.transfer_write %7, %3[%8, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<16x8xf16>
     return
   }
-  transform.sequence failures(propagate) {
-  ^bb1(%variant_op: !transform.any_op):
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     %transformed_func = transform.iree.layout_analysis_and_distribution %top_level_func : (!transform.any_op) -> (!transform.any_op)
+    transform.yield 
   }
-}
+} // module
+
 
 // CHECK-DAG:  #[[MAP:.+]] = affine_map<()[s0] -> (s0 * 16)>
 // CHECK-DAG:  #[[MAP1:.+]] = affine_map<(d0)[s0] -> (d0 + s0)>
@@ -505,7 +509,7 @@
 #map4 = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map5 = affine_map<(d0, d1, d2) -> (d1, d2)>
 #map6 = affine_map<(d0, d1, d2) -> (d0, d1)>
-builtin.module {
+builtin.module attributes { transform.with_named_sequence } {
   func.func @matmul_scf() {
     %cst = arith.constant 0.000000e+00 : f16
     %c0 = arith.constant 0 : index
@@ -535,12 +539,13 @@
     vector.transfer_write %7, %3[%8, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<16x8xf16>
     return
   }
-  transform.sequence failures(propagate) {
-  ^bb1(%variant_op: !transform.any_op):
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     %transformed_func = transform.iree.layout_analysis_and_distribution %top_level_func : (!transform.any_op) -> (!transform.any_op)
+    transform.yield 
   }
-}
+} // module
+
 
 // CHECK-DAG:  #[[MAP:.+]] = affine_map<()[s0] -> (s0 * 16)>
 // CHECK-DAG:  #[[MAP1:.+]] = affine_map<(d0) -> (d0 * 16)>
@@ -679,7 +684,7 @@
 #map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map3 = affine_map<(d0, d1, d2) -> (d1, d2)>
 #map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
-builtin.module {
+builtin.module attributes { transform.with_named_sequence } {
   func.func @matmul_dispatch_0_matmul_16x8x16() {
     %c0 = arith.constant 0 : index
     %cst = arith.constant dense<0.000000e+00> : vector<16x8xf16>
@@ -702,12 +707,13 @@
     vector.transfer_write %10, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<16x8xf16, strided<[8, 1], offset: ?>>
     return
   }
-  transform.sequence failures(propagate) {
-  ^bb1(%variant_op: !transform.any_op):
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     %transformed_func = transform.iree.layout_analysis_and_distribution %top_level_func : (!transform.any_op) -> (!transform.any_op)
+    transform.yield 
   }
-}
+} // module
+
 
 // CHECK-DAG:  #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d1 + d2 * 16)>
 // CHECK-DAG:  #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0 * 2)>
@@ -836,7 +842,7 @@
 
 // -----
 
-builtin.module {
+builtin.module attributes { transform.with_named_sequence } {
   func.func @matmul_dispatch_0_matmul_16x8x16_shared() {
     %c0 = arith.constant 0 : index
     %cst = arith.constant dense<0.000000e+00> : vector<16x8xf16>
@@ -851,12 +857,12 @@
     vector.transfer_write %5, %2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<16x8xf16>
     return
   }
-  transform.sequence failures(propagate) {
-  ^bb1(%variant_op: !transform.any_op):
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     %transformed_func = transform.iree.layout_analysis_and_distribution %top_level_func : (!transform.any_op) -> (!transform.any_op)
+    transform.yield 
   }
-}
+} // module
 
 // CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d1 + d2 * 16)>
 // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0 * 2)>
@@ -913,7 +919,7 @@
 #map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map3 = affine_map<(d0, d1, d2) -> (d1, d2)>
 #map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
-builtin.module {
+builtin.module attributes { transform.with_named_sequence } {
   func.func @matmul_dispatch_0_matmul_16x16x16_f16() {
     %c0 = arith.constant 0 : index
     %cst = arith.constant dense<0.000000e+00> : vector<16x16xf16>
@@ -973,13 +979,13 @@
     vector.transfer_write %32, %subview[%c0_0, %c0_0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<16x8xf16, strided<[8, 1], offset: ?>>
     return
   }
-  transform.sequence failures(propagate) {
-  ^bb1(%variant_op: !transform.any_op):
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     %reordered_func = transform.iree.reorder_transpose %top_level_func : (!transform.any_op) -> !transform.any_op
-     transform.iree.apply_cse %reordered_func : !transform.any_op
+    transform.iree.apply_cse %reordered_func : !transform.any_op
+    transform.yield
   }
-}
+} // module
 
 // CHECK-DAG:  #[[MAP:.+]] = affine_map<(d0) -> (d0 * 16)>
 // CHECK-DAG:  #[[MAP1:.+]] = affine_map<(d0, d1) -> (d1, d0)>
@@ -1064,7 +1070,7 @@
 #map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
 #map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
-builtin.module {
+builtin.module attributes { transform.with_named_sequence } {
   func.func @double_matmul_dispatch_0_matmul_16x16x16() {
     %c0 = arith.constant 0 : index
     %cst = arith.constant dense<0.000000e+00> : vector<16x16xf16>
@@ -1099,12 +1105,12 @@
     vector.transfer_write %9, %subview[%c0_1, %c0_1] {in_bounds = [true, true]} : vector<16x8xf16>, memref<16x8xf16, strided<[8, 1], offset: ?>>
     return
   }
-  transform.sequence failures(propagate) {
-  ^bb1(%variant_op: !transform.any_op):
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     %transformed_func = transform.iree.layout_analysis_and_distribution %top_level_func : (!transform.any_op) -> (!transform.any_op)
+    transform.yield
   }
-}
+} // module
 
 // CHECK-DAG:  #[[MAP:.+]] = affine_map<(d0) -> (d0 * 16)>
 // CHECK-DAG:  #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d1 + d2 * 16)>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir
index e6c4d1b..dff5d2c 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir
@@ -1,11 +1,11 @@
 // RUN: iree-opt %s  --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-select-lowering-strategy, iree-llvmgpu-lower-executable-target)))" \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/transform_dialect_codegen_bufferize_spec.mlir | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_bufferize_spec.mlir | \
 // RUN: FileCheck %s
 
 // RUN: iree-opt %s  --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-select-lowering-strategy, iree-llvmgpu-lower-executable-target)))" \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/transform_dialect_codegen_foreach_to_gpu_spec.mlir | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_foreach_to_gpu_spec.mlir | \
 // RUN: FileCheck %s --check-prefix=FOREACH-TO-GPU
 
 #device_target_cuda = #hal.device.target<"cuda", {executable_targets = [#hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>]}>
@@ -14,7 +14,11 @@
 module attributes {hal.device.targets = [#device_target_cuda]} {
   hal.executable private @matmul_static_dispatch_0 {
     hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) {
-      hal.executable.export public @matmul_static_dispatch_0 ordinal(0) layout(#pipeline_layout)
+      hal.executable.export public @matmul_static_dispatch_0 ordinal(0) layout(#pipeline_layout) {
+      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
+        %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
+        hal.return %x, %y, %z : index, index, index
+      }
       builtin.module {
         func.func @matmul_static_dispatch_0() {
           %c0 = arith.constant 0 : index
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir
index edbbfa4..839d389 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir
@@ -81,7 +81,7 @@
 }
 
 
-// CHECK: transform.sequence  failures(propagate) {
+// CHECK: transform.named_sequence
 // CHECK:   transform.iree.register_match_callbacks
 // CHECK:   %[[MATCH:.+]]:2 = transform.iree.match_callback failures(propagate) "batch_matmul"
 // CHECK:   %[[TILED:.+]], %[[FORALL:.+]] = transform.structured.tile_using_forall %[[MATCH]]#1
@@ -98,7 +98,7 @@
 // CHECK:   %[[PADDED:.+]], %{{.*}}, %{{.+}} = transform.structured.pad %tiled_linalg_op
 // CHECK:     pack_paddings = [1, 1, 1, 1], pad_to_multiple_of = [1, 1, 1, 1], padding_dimensions = [0, 1, 2, 3]
 // CHECK:     padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]}
-// CHECK:   %[[V3:.+]] = get_producer_of_operand %[[PADDED]][2]
+// CHECK:   %[[V3:.+]] = transform.get_producer_of_operand %[[PADDED]][2]
 // CHECK:   transform.structured.hoist_pad %{{.*}} by 1 loops
 // CHECK:   apply_patterns
 // CHECK:   transform.iree.apply_licm
@@ -109,8 +109,8 @@
 // CHECK:   transform.iree.apply_cse
 // CHECK:   transform.structured.match ops{["tensor.parallel_insert_slice"]}
 // CHECK:   transform.structured.insert_slice_to_copy
-// CHECK:   %[[LHS:.+]] = get_producer_of_operand %[[PADDED]][0]
-// CHECK:   %[[RHS:.+]] = get_producer_of_operand %[[PADDED]][1]
+// CHECK:   %[[LHS:.+]] = transform.get_producer_of_operand %[[PADDED]][0]
+// CHECK:   %[[RHS:.+]] = transform.get_producer_of_operand %[[PADDED]][1]
 // CHECK:   %[[RHS_DPS:.+]] = transform.structured.rewrite_in_destination_passing_style %[[RHS]]
 
 // CHECK:   transform.structured.tile_using_forall %[[LHS]] 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir
index 0c18806..3345f41 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir
@@ -29,10 +29,10 @@
 
 // CHECK-LABEL: func @nchw_convolution
 
-// CHECK: transform.sequence  failures(propagate) {
+// CHECK: transform.named_sequence
 // CHECK: transform.iree.match_callback failures(propagate) "convolution"
 // CHECK: transform.structured.convert_conv2d_to_img2col
-// CHECK: get_producer_of_operand %{{.*}}[0]
+// CHECK: transform.get_producer_of_operand %{{.*}}[0]
 // CHECK: transform.apply_patterns.iree.bubble_collapse
 // CHECK: transform.structured.tile_using_forall %{{.*}}   tile_sizes [1, 128, 128](mapping = [#gpu.block<z>, #gpu.block<y>, #gpu.block<x>])
 // CHECK: transform.structured.fuse_into_containing_op
@@ -44,10 +44,10 @@
 // CHECK: transform.structured.fuse_into_containing_op
 // CHECK: transform.structured.pad %{{.*}} {copy_back_op = "none", pack_paddings = [1, 0, 1], pad_to_multiple_of = [1, 1, 1, 1], padding_dimensions = [0, 1, 2, 3], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]}
 // CHECK: transform.structured.match ops{["linalg.fill"]}
-// CHECK: %[[RES:.+]] = get_producer_of_operand %{{.*}}[2]
+// CHECK: %[[RES:.+]] = transform.get_producer_of_operand %{{.*}}[2]
 // CHECK: transform.structured.rewrite_in_destination_passing_style %[[RES]]
-// CHECK: %[[LHS:.+]] = get_producer_of_operand %{{.*}}[0]
-// CHECK: %[[RHS:.+]] = get_producer_of_operand %{{.*}}[1]
+// CHECK: %[[LHS:.+]] = transform.get_producer_of_operand %{{.*}}[0]
+// CHECK: %[[RHS:.+]] = transform.get_producer_of_operand %{{.*}}[1]
 // CHECK: transform.structured.rewrite_in_destination_passing_style %[[LHS]]
 // CHECK: transform.structured.tile_using_forall %{{.*}}   num_threads [32, 4](mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
 // CHECK: transform.structured.tile_using_forall %[[RHS]]   num_threads [1, 4, 32](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
@@ -100,13 +100,13 @@
 
 // CHECK-LABEL: func @nhwc_convolution
 
-// CHECK: transform.sequence  failures(propagate) {
+// CHECK: transform.named_sequence
 // CHECK: transform.structured.tile_using_forall %{{.*}}   tile_sizes [1, 128, 128](mapping = [#gpu.block<z>, #gpu.block<y>, #gpu.block<x>])
 // CHECK: transform.structured.pad %{{.*}} {copy_back_op = "none", pack_paddings = [0, 1, 1], pad_to_multiple_of = [1, 1, 1, 1], padding_dimensions = [0, 1, 2, 3], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]}
-// CHECK: %[[RES:.+]] = get_producer_of_operand %{{.*}}[2]
+// CHECK: %[[RES:.+]] = transform.get_producer_of_operand %{{.*}}[2]
 // CHECK: transform.structured.rewrite_in_destination_passing_style %[[RES]]
-// CHECK: %[[LHS:.+]] = get_producer_of_operand %{{.*}}[0]
-// CHECK: %[[RHS:.+]] = get_producer_of_operand %{{.*}}[1]
+// CHECK: %[[LHS:.+]] = transform.get_producer_of_operand %{{.*}}[0]
+// CHECK: %[[RHS:.+]] = transform.get_producer_of_operand %{{.*}}[1]
 // CHECK: transform.structured.rewrite_in_destination_passing_style %[[RHS]]
 // CHECK: transform.structured.tile_using_forall %[[LHS]]   num_threads [1, 32, 4](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
 // CHECK: transform.structured.tile_using_forall %{{.*}}   num_threads [4, 32](mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
@@ -148,4 +148,4 @@
 // CHECK-LABEL: func @unaligned_convolution
 
 // Currently padding on the img2col op is not supported so bail out for unaligned.
-// CHECK-NOT: transform.sequence
+// CHECK-NOT: transform.named_sequence
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir
index 1c8017d..34e2eba 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir
@@ -67,7 +67,7 @@
 
 // CHECK-LABEL: func @matmul_1
 
-// CHECK: transform.sequence  failures(propagate) {
+// CHECK: transform.named_sequence
 // CHECK: transform.iree.match_callback failures(propagate) "matmul"
 // CHECK: transform.structured.tile_using_forall %{{.*}} tile_sizes [128, 128](mapping = [#gpu.block<y>, #gpu.block<x>])
 // CHECK: transform.structured.fuse_into_containing_op
@@ -127,7 +127,7 @@
 
 // WITH_OPTIONS-LABEL: func @matmul_1
 
-// WITH_OPTIONS: transform.sequence  failures(propagate) {
+// WITH_OPTIONS: transform.named_sequence
 // WITH_OPTIONS: transform.iree.match_callback failures(propagate) "matmul"
 // Tile sizes are set by td-matmul-strategy-blk-size-XX.
 // WITH_OPTIONS: transform.structured.tile_using_forall %{{.*}} tile_sizes [256, 64](mapping = [#gpu.block<y>, #gpu.block<x>])
@@ -233,7 +233,7 @@
 
 // CHECK-LABEL: func @matmul_2
 
-// CHECK: transform.sequence  failures(propagate) {
+// CHECK: transform.named_sequence
 // CHECK: transform.iree.match_callback failures(propagate) "matmul"
 // CHECK: transform.structured.tile_using_forall %{{.*}} tile_sizes [128, 128](mapping = [#gpu.block<y>, #gpu.block<x>])
 // CHECK: transform.iree.populate_workgroup_count_region_using_num_threads_slice
@@ -287,7 +287,7 @@
 
 // CHECK-LABEL: func @matmul_3
 
-// CHECK: transform.sequence  failures(propagate) {
+// CHECK: transform.named_sequence
 
 // WITH_OPTIONS_2-LABEL: func @matmul_3
 
@@ -335,10 +335,10 @@
 // CHECK       }
 // CHECK:      transform.iree.apply_licm
 // CHECK:      transform.iree.apply_cse
-// CHECK:      %[[RES_PAD:.+]] = get_producer_of_operand %{{.*}}[2]
+// CHECK:      %[[RES_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[2]
 // CHECK:      %[[RES_COPY:.+]] = transform.structured.rewrite_in_destination_passing_style %[[RES_PAD]]
-// CHECK:      %[[LHS_PAD:.+]] = get_producer_of_operand %{{.*}}[0]
-// CHECK:      %[[RHS_PAD:.+]] = get_producer_of_operand %{{.*}}[1]
+// CHECK:      %[[LHS_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[0]
+// CHECK:      %[[RHS_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[1]
 // CHECK:      %[[TILED_LHS:.+]], %{{.*}} = transform.structured.tile_using_forall %[[LHS_PAD]]   num_threads [32, 4](mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
 // CHECK:      transform.structured.match ops{["scf.if"]}
 // CHECK:      transform.scf.take_assumed_branch %{{.*}} take_else_branch
@@ -409,10 +409,10 @@
 // CHECK       }
 // CHECK:      transform.iree.apply_licm
 // CHECK:      transform.iree.apply_cse
-// CHECK:      %[[RES_PAD:.+]] = get_producer_of_operand %{{.*}}[2]
+// CHECK:      %[[RES_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[2]
 // CHECK:      %[[RES_COPY:.+]] = transform.structured.rewrite_in_destination_passing_style %[[RES_PAD]]
-// CHECK:      %[[LHS_PAD:.+]] = get_producer_of_operand %{{.*}}[0]
-// CHECK:      %[[RHS_PAD:.+]] = get_producer_of_operand %{{.*}}[1]
+// CHECK:      %[[LHS_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[0]
+// CHECK:      %[[RHS_PAD:.+]] = transform.get_producer_of_operand %{{.*}}[1]
 // CHECK:      %[[LHS_COPY:.+]] = transform.structured.rewrite_in_destination_passing_style %[[LHS_PAD]]
 // CHECK:      %[[RHS_COPY:.+]] = transform.structured.rewrite_in_destination_passing_style %[[RHS_PAD]]
 // CHECK:      transform.structured.tile_using_forall %[[LHS_COPY]]   num_threads [32, 4](mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
@@ -427,7 +427,8 @@
 // Verify we don't go down the path without the flag.
 // WITH_OPTIONS-LABEL: func @aligned_matmul
 
-// WITH_OPTIONS-NOT: transform.sequence  failures(propagate) {
+// WITH_OPTIONS-NOT: transform.sequence
+// WITH_OPTIONS-NOT: transform.named_sequence
 
 // WITH_OPTIONS_2-LABEL: func @aligned_matmul
 
@@ -472,7 +473,7 @@
 // WITH_OPTIONS_3-LABEL: func @matmul_5_small
 
 // SMALL-LABEL: func @matmul_5_small
-// SMALL: transform.sequence
+// SMALL: transform.named_sequence
 // SMALL-NOT: mma
 // SMALL-NOT: wmma
 
@@ -507,6 +508,7 @@
 // CHECK:       iree_codegen.translation_info<LLVMGPUMatmulSimt>
 // CHECK-LABEL: func @f16_matmul
 // CHECK-NOT: transform.sequence
+// CHECK-NOT: transform.named_sequence
 
 // WITH_OPTIONS_2-LABEL: func @f16_matmul
 
@@ -542,18 +544,22 @@
 }
 
 // SMALL-LABEL: func @int8_matmul
-// SMALL: transform.sequence
+// SMALL: transform.named_sequence
 // SMALL-NOT: mma
 // SMALL-NOT: wmma
 
 // CHECK-LABEL: func @int8_matmul
 // CHECK-NOT: transform.sequence
+// CHECK-NOT: transform.named_sequence
 
 // WITH_OPTIONS-LABEL: func @int8_matmul
 // WITH_OPTIONS-NOT: transform.sequence
+// WITH_OPTIONS-NOT: transform.named_sequence
 
 // WITH_OPTIONS_2-LABEL: func @int8_matmul
 // WITH_OPTIONS_2-NOT: transform.sequence
+// WITH_OPTIONS_2-NOT: transform.named_sequence
 
 // WITH_OPTIONS_3-LABEL: func @int8_matmul
 // WITH_OPTIONS_3-NOT: transform.sequence
+// WITH_OPTIONS_3-NOT: transform.named_sequence
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir
index 5168e75..118c0ac 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir
@@ -44,7 +44,7 @@
 }
 
 // CHECK-LABEL: func @pad
-//       CHECK:   transform.sequence  failures(propagate) {
+//       CHECK:   transform.named_sequence
 //       CHECK:   transform.iree.register_match_callbacks
 //       CHECK:   {{.*}} = transform.iree.match_callback failures(propagate) "pad"({{.*}}) : (!transform.any_op) -> !transform.any_op
 //       CHECK:   transform.structured.tile_using_forall {{.*}}   tile_sizes [64, 64](mapping = [#gpu.block<y>, #gpu.block<x>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir
index 7066ae1..1769c4b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir
@@ -28,10 +28,13 @@
     }
   }
 
-  transform.sequence failures(propagate) {
-  ^bb1(%variant_op: !transform.any_op):
-    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-    %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op: (!transform.any_op) -> !transform.any_op
-    %func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  }
+  builtin.module attributes { transform.with_named_sequence } {
+    transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.consumed}) {
+      transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+      %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op: (!transform.any_op) -> !transform.any_op
+      %func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+      transform.yield
+    } // @__transform_main
+  } // module
+
 }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir
index ed73b9d..472b508 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir
@@ -1,6 +1,9 @@
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
-  transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-  %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> !transform.any_op
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(
+      %variant_op: !transform.any_op {transform.consumed}) {
+    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+    %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> !transform.any_op
+    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir
index d3df044..ebf730c 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir
@@ -1,42 +1,46 @@
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
-  %0 = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %forall, %tiled_fill = transform.structured.tile_using_forall %0 num_threads [5, 1] 
-  ( mapping = [#gpu.thread<y>, #gpu.thread<x>] )
-  : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(
+      %variant_op: !transform.any_op {transform.consumed}) {
+    %0 = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %forall, %tiled_fill = transform.structured.tile_using_forall %0 num_threads [5, 1]
+    ( mapping = [#gpu.thread<y>, #gpu.thread<x>] )
+    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
-  %1 = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %forall_2, %tiled_matmul = transform.structured.tile_using_forall %1 num_threads [7, 9]
-  ( mapping = [#gpu.thread<x>, #gpu.thread<y>] )
-  : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %1 = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %forall_2, %tiled_matmul = transform.structured.tile_using_forall %1 num_threads [7, 9]
+    ( mapping = [#gpu.thread<x>, #gpu.thread<y>] )
+    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
-  // Canonicalization/CSE is needed before bufferization otherwise unnecessary
-  // allocs will be created.
-  %func = transform.structured.match ops{["func.func"]} in %variant_op
-    : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-  } : !transform.any_op
-  transform.apply_patterns to %func {
-    transform.apply_patterns.tensor.reassociative_reshape_folding
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_cse %func : !transform.any_op
-  transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-  %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> (!transform.any_op)
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  transform.iree.map_nested_forall_to_gpu_threads %memref_func 
-    workgroup_dims = [10, 11, 1] : (!transform.any_op) -> ()
+    // Canonicalization/CSE is needed before bufferization otherwise unnecessary
+    // allocs will be created.
+    %func = transform.structured.match ops{["func.func"]} in %variant_op
+      : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.iree.fold_fill_into_pad
+      transform.apply_patterns.linalg.tiling_canonicalization
+      transform.apply_patterns.scf.for_loop_canonicalization
+    } : !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.tensor.reassociative_reshape_folding
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.iree.apply_cse %func : !transform.any_op
+    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+    %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> (!transform.any_op)
+    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    transform.iree.map_nested_forall_to_gpu_threads %memref_func
+      workgroup_dims = [10, 11, 1] : (!transform.any_op) -> ()
 
-  // Late canonicalizations to cleanup and pass the checks
-  transform.apply_patterns to %memref_func {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_licm %memref_func : !transform.any_op
-  transform.iree.apply_cse %memref_func : !transform.any_op
-}
+    // Late canonicalizations to cleanup and pass the checks
+    transform.apply_patterns to %memref_func {
+      transform.apply_patterns.iree.fold_fill_into_pad
+      transform.apply_patterns.linalg.tiling_canonicalization
+      transform.apply_patterns.scf.for_loop_canonicalization
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.iree.apply_licm %memref_func : !transform.any_op
+    transform.iree.apply_cse %memref_func : !transform.any_op
+    transform.yield
+  }
+} // module
+
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_distribution_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_distribution_spec.mlir
index 90a2ee0..04474cd 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_distribution_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_distribution_spec.mlir
@@ -1,23 +1,27 @@
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
-  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op 
-    : (!transform.any_op) -> !transform.any_op
-  %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
-    : (!transform.any_op) -> !transform.any_op
-  %isolated = transform.get_parent_op %warp {isolated_from_above} 
-    : (!transform.any_op) -> !transform.any_op
-  transform.iree.vector.warp_distribute %isolated
-    : (!transform.any_op) -> ()
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
 
-  // Late canonicalizations to cleanup and pass the checks.
-  %func_op = transform.structured.match ops{["func.func"]} in %variant_op
-    : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func_op {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_licm %func_op : !transform.any_op
-  transform.iree.apply_cse %func_op : !transform.any_op
-}
+    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op
+      : (!transform.any_op) -> !transform.any_op
+    %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
+      : (!transform.any_op) -> !transform.any_op
+    %isolated = transform.get_parent_op %warp {isolated_from_above}
+      : (!transform.any_op) -> !transform.any_op
+    transform.iree.vector.warp_distribute %isolated
+      : (!transform.any_op) -> ()
+
+    // Late canonicalizations to cleanup and pass the checks.
+    %func_op = transform.structured.match ops{["func.func"]} in %variant_op
+      : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func_op {
+      transform.apply_patterns.iree.fold_fill_into_pad
+      transform.apply_patterns.linalg.tiling_canonicalization
+      transform.apply_patterns.scf.for_loop_canonicalization
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.iree.apply_licm %func_op : !transform.any_op
+    transform.iree.apply_cse %func_op : !transform.any_op
+
+    transform.yield
+  }
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir
index 93a8ae6..33fa29e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir
@@ -1,16 +1,19 @@
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
-  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op 
-    : (!transform.any_op) -> !transform.any_op
-  transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
-    : (!transform.any_op) -> !transform.any_op
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
 
-  // Late canonicalizations to cleanup and pass the checks.
-  %func_op = transform.structured.match ops{["func.func"]} in %variant_op
-    : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func_op {
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_licm %func_op : !transform.any_op
-  transform.iree.apply_cse %func_op : !transform.any_op
-}
+    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op
+      : (!transform.any_op) -> !transform.any_op
+    transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
+      : (!transform.any_op) -> !transform.any_op
+
+    // Late canonicalizations to cleanup and pass the checks.
+    %func_op = transform.structured.match ops{["func.func"]} in %variant_op
+      : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func_op {
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.iree.apply_licm %func_op : !transform.any_op
+    transform.iree.apply_cse %func_op : !transform.any_op
+    transform.yield
+  }
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_eliminate_gpu_barriers.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_eliminate_gpu_barriers.mlir
index 51ce0c9..4babefb 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_eliminate_gpu_barriers.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_eliminate_gpu_barriers.mlir
@@ -18,11 +18,13 @@
   return
 }
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-  transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  } // @__transform_main
+} // module
 
 // -----
 
@@ -44,11 +46,13 @@
   return %2 : f32
 }
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-  transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  } // @__transform_main
+} // module
 
 // -----
 
@@ -66,11 +70,13 @@
   return
 }
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-  transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  } // @__transform_main
+} // module
 
 // -----
 
@@ -95,12 +101,13 @@
   return
 }
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-  transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
-}
-
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  } // @__transform_main
+} // module
 
 // -----
 
@@ -128,11 +135,13 @@
   return
 }
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-  transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  } // @__transform_main
+} // module
 
 // -----
 
@@ -151,11 +160,13 @@
   return %0, %1 : memref<42xf32>, memref<10xf32>
 }
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-  transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  } // @__transform_main
+} // module
 
 // -----
 
@@ -173,11 +184,13 @@
   return %0 : memref<42xf32>
 }
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-  transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  } // @__transform_main
+} // module
 
 // -----
 
@@ -193,11 +206,13 @@
   return %0 : f32
 }
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-  transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  } // @__transform_main
+} // module
 
 // -----
 
@@ -237,8 +252,10 @@
   return %0, %1, %2, %3, %4 : f32, f32, f32, f32, f32
 }
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-  transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.iree.eliminate_gpu_barriers %0 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  } // @__transform_main
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_hoist_allocs.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_hoist_allocs.mlir
index ef85f37..c6e3281 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_hoist_allocs.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_hoist_allocs.mlir
@@ -14,12 +14,14 @@
 //  CHECK-NEXT:   ^bb1:
 //  CHECK-NEXT:   return
 
-transform.sequence failures(propagate) {
-^bb1(%module: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
     %func = transform.structured.match ops{["func.func"]} in %module
       : (!transform.any_op) -> !transform.op<"func.func">
     transform.iree.hoist_static_alloc %func : (!transform.op<"func.func">) -> ()
-}
+    transform.yield
+  } // @__transform_main
+} // module
 
 // -----
 
@@ -45,12 +47,14 @@
 //  CHECK-NEXT:   }
 //  CHECK-NEXT:   memref.dealloc %[[ALLOC]] : memref<16x16xi32>
 
-transform.sequence failures(propagate) {
-^bb1(%module: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
     %func = transform.structured.match ops{["func.func"]} in %module
       : (!transform.any_op) -> !transform.op<"func.func">
     transform.iree.hoist_static_alloc %func : (!transform.op<"func.func">) -> ()
-}
+    transform.yield
+  } // @__transform_main
+} // module
 
 // -----
 
@@ -77,9 +81,11 @@
 //  CHECK-NEXT:   }
 //  CHECK-NEXT:   memref.dealloc %[[ALLOC]] : memref<16x16xi32>
 
-transform.sequence failures(propagate) {
-^bb1(%module: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
     %func = transform.structured.match ops{["func.func"]} in %module
       : (!transform.any_op) -> !transform.op<"func.func">
     transform.iree.hoist_static_alloc %func : (!transform.op<"func.func">) -> ()
-}
+    transform.yield
+  } // @__transform_main
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_pack_shared_memory_alloc.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_pack_shared_memory_alloc.mlir
index da0e7bc..504950e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_pack_shared_memory_alloc.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_pack_shared_memory_alloc.mlir
@@ -25,9 +25,11 @@
   return
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
   %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
   transform.iree.pack_shared_memory_alloc %0 : (!transform.any_op) -> ()
   transform.iree.apply_cse %0 : !transform.any_op
-}
+    transform.yield
+  } // @__transform_main
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_promote_operands.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_promote_operands.mlir
index c36a1ff..757baf4 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_promote_operands.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_promote_operands.mlir
@@ -28,24 +28,26 @@
     }
   }
 
-  transform.sequence failures(propagate) {
-  ^bb1(%variant_op: !transform.any_op):
-    %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op
-      : (!transform.any_op) -> !transform.any_op
-    %promoted_matmul, %alloc_0, %alloc_1 =
-      transform.iree.promote_operands %matmul [0, 1] 
-        : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+  builtin.module attributes { transform.with_named_sequence } {
+    transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
+      %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op
+        : (!transform.any_op) -> !transform.any_op
+      %promoted_matmul, %alloc_0, %alloc_1 =
+        transform.iree.promote_operands %matmul [0, 1]
+          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
 
-    // Late canonicalizations to cleanup and pass the checks.
-    %func_op = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func_op {
-      transform.apply_patterns.iree.fold_fill_into_pad
-      transform.apply_patterns.linalg.tiling_canonicalization
-      transform.apply_patterns.scf.for_loop_canonicalization
-      transform.apply_patterns.canonicalization
-    } : !transform.any_op
-    transform.iree.apply_licm %func_op : !transform.any_op
-    transform.iree.apply_cse %func_op : !transform.any_op
-  }
+      // Late canonicalizations to cleanup and pass the checks.
+      %func_op = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+      transform.apply_patterns to %func_op {
+        transform.apply_patterns.iree.fold_fill_into_pad
+        transform.apply_patterns.linalg.tiling_canonicalization
+        transform.apply_patterns.scf.for_loop_canonicalization
+        transform.apply_patterns.canonicalization
+      } : !transform.any_op
+      transform.iree.apply_licm %func_op : !transform.any_op
+      transform.iree.apply_cse %func_op : !transform.any_op
+      transform.yield
+    } // @__transform_main
+  } // module
 }
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir
index a162fca..b1c2065 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir
@@ -1,8 +1,10 @@
-// RUN: iree-opt %s --pass-pipeline="builtin.module(hal.executable(iree-transform-dialect-interpreter{transform-file-name=%p/transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir}))" \
+// RUN: iree-opt %s --pass-pipeline="builtin.module(hal.executable(iree-transform-dialect-interpreter))" \
+// RUN: --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir \
 // RUN: --allow-unregistered-dialect | \
 // RUN: FileCheck %s --check-prefix=WARP-EXECUTE
 
-// RUN: iree-opt %s --pass-pipeline="builtin.module(hal.executable(iree-transform-dialect-interpreter{transform-file-name=%p/transform_dialect_codegen_vector_distribution_spec.mlir}))" \
+// RUN: iree-opt %s --pass-pipeline="builtin.module(hal.executable(iree-transform-dialect-interpreter))" \
+// RUN: --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_vector_distribution_spec.mlir \
 // RUN: --allow-unregistered-dialect | \
 // RUN: FileCheck %s
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir
index 818a7fe..86b4321 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir
@@ -46,24 +46,24 @@
         } {mapping = [#gpu.warp<x>]}
         return
       }
-      module {
-        transform.sequence failures(propagate) {
-        ^bb0(%variant_op: !transform.any_op):
-        %17 = transform.structured.match ops{["func.func"]} in %variant_op
-          : (!transform.any_op) -> !transform.any_op
-        transform.iree.map_nested_forall_to_gpu_threads %17
-          workgroup_dims = [256, 1, 1] subgroup_size = 32 : (!transform.any_op) -> ()
+      builtin.module attributes { transform.with_named_sequence } {
+        transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
+          %17 = transform.structured.match ops{["func.func"]} in %variant_op
+            : (!transform.any_op) -> !transform.any_op
+          transform.iree.map_nested_forall_to_gpu_threads %17
+            workgroup_dims = [256, 1, 1] subgroup_size = 32 : (!transform.any_op) -> ()
 
-        // Late canonicalizations to cleanup and pass the checks.
-        // Needs to occur on the whole variant to perform cse on the workgroup_count region
-        %func_op = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func_op {
-          transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.iree.apply_licm %func_op : !transform.any_op
-        transform.iree.apply_cse %func_op : !transform.any_op
-      }
+          // Late canonicalizations to cleanup and pass the checks.
+          // Needs to occur on the whole variant to perform cse on the workgroup_count region
+          %func_op = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+          transform.apply_patterns to %func_op {
+            transform.apply_patterns.canonicalization
+          } : !transform.any_op
+          transform.iree.apply_licm %func_op : !transform.any_op
+          transform.iree.apply_cse %func_op : !transform.any_op
+          transform.yield
+        } // @__transform_main
+      } // module
     }
   }
 }
-}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_gpu_pipelining.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_gpu_pipelining.mlir
index b88f408..5bf464e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_gpu_pipelining.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_gpu_pipelining.mlir
@@ -52,13 +52,16 @@
   return
 }
 }
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
-  %for = transform.structured.match ops{["scf.for"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %1 = transform.cast %for : !transform.any_op to !transform.op<"scf.for">
-  %2 = transform.iree.pipeline_shared_memory_copies %1 { depth = 4 } : (!transform.op<"scf.for">) -> !transform.op<"scf.for">
 }
-}
+
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+    %for = transform.structured.match ops{["scf.for"]} in %root : (!transform.any_op) -> !transform.any_op
+    %1 = transform.cast %for : !transform.any_op to !transform.op<"scf.for">
+    %2 = transform.iree.pipeline_shared_memory_copies %1 { depth = 4 } : (!transform.op<"scf.for">) -> !transform.op<"scf.for">
+    transform.yield
+  } // @__transform_main
+} // module
 
 // CHECK-LABEL: func.func @matmul_pipelining
 // CHECK: nvgpu.device_async_copy
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_vector_to_mma.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_vector_to_mma.mlir
index 6180d5f..6bfe345 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_vector_to_mma.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_vector_to_mma.mlir
@@ -47,22 +47,25 @@
   return
 }
 }
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
-  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func {
-    transform.apply_patterns.iree.unroll_vectors_gpu_wmma_sync
-  } : !transform.any_op
-  transform.iree.vector.vector_to_mma_conversion %func { use_wmma } : (!transform.any_op) -> ()
+}
 
-  // Apply canonicalization post-hoc to trigger DCE and pass the test 
-  // (i.e. all vector.contract are dead).
-  // TODO: consider having the vector_to_mma_conversion do the DCE automatically.
-  transform.apply_patterns to %func {
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-}
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.iree.unroll_vectors_gpu_wmma_sync
+    } : !transform.any_op
+    transform.iree.vector.vector_to_mma_conversion %func { use_wmma } : (!transform.any_op) -> ()
+
+    // Apply canonicalization post-hoc to trigger DCE and pass the test
+    // (i.e. all vector.contract are dead).
+    // TODO: consider having the vector_to_mma_conversion do the DCE automatically.
+    transform.apply_patterns to %func {
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.yield
+  } // @__transform_main
+} // module
 
 // -----
 
@@ -128,8 +131,9 @@
   return
 }
 }
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
+
+builtin.module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
   %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
   transform.apply_patterns to %func {
     transform.apply_patterns.iree.unroll_vectors_gpu_wmma_sync
@@ -138,5 +142,7 @@
   transform.apply_patterns to %func {
     transform.apply_patterns.canonicalization
   } : !transform.any_op
-}
+    transform.yield
+  } // @__transform_main
+} // module
 }
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/SPIRV/test/BUILD.bazel
index f6c559f..d183c2e 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/BUILD.bazel
@@ -6,8 +6,8 @@
 
 # Tests for common transforms.
 
-load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
 load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
 
 package(
     features = ["layering_check"],
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/set_transform_strategy.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/set_transform_strategy.mlir
index 300398f..1aa48d0 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/set_transform_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/set_transform_strategy.mlir
@@ -45,7 +45,7 @@
 
 // CHECK-LABEL: func @matmul
 
-// CHECK: transform.sequence  failures(propagate) {
+// CHECK: transform.named_sequence
 
 /// The specific vector sizes are tested in the LLVMGPU tests and thus omitted
 /// here. This is just to check that masked vectorization is used.
diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp
index fea61a8..4a122f0 100644
--- a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp
+++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp
@@ -9,6 +9,7 @@
 #include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h"
 #include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h"
 #include "iree/compiler/Codegen/TransformStrategies/Common/AbstractReductionStrategy.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
@@ -37,8 +38,8 @@
 using transform::MatchOp;
 using transform::MemRefEraseDeadAllocAndStoresOp;
 using transform::MergeHandlesOp;
+using transform::NamedSequenceOp;
 using transform::PrintOp;
-using transform::SequenceOp;
 using transform::SplitHandleOp;
 using transform::SplitReductionOp;
 using transform::TileUsingForallOp;
@@ -96,17 +97,25 @@
   OpBuilder b(ctx);
   b.setInsertionPointAfter(entryPoint);
   auto topLevelTransformModule = b.create<ModuleOp>(loc);
+  topLevelTransformModule->setAttr(
+      transform::TransformDialect::kWithNamedSequenceAttrName, b.getUnitAttr());
   Region &topLevelTransformRegion = topLevelTransformModule.getBodyRegion();
   b.setInsertionPointToStart(&topLevelTransformRegion.front());
   auto anyOpType = transform::AnyOpType::get(b.getContext());
-  auto sequence = b.create<transform::SequenceOp>(
-      loc, TypeRange{}, transform::FailurePropagationMode::Propagate, anyOpType,
-      [&](OpBuilder &b, Location loc, Value variantH) {
+  auto sequence = b.create<transform::NamedSequenceOp>(
+      loc,
+      /*symName=*/
+      std::string(
+          transform::TransformDialect::kTransformEntryPointSymbolName.str()),
+      /*rootType=*/anyOpType,
+      /*resultTypes=*/TypeRange{},
+      /*bodyBuilder=*/[&](OpBuilder &b, Location loc, Value variantH) {
         ImplicitLocOpBuilder ib(loc, b);
         buildStrategy(ib, variantH);
         b.create<transform::YieldOp>(loc);
       });
   (void)sequence;
+
   LDBG("transformation script:\n");
   LDBG("verification: " << sequence.verify().succeeded() << "\n");
 }
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel
index ef00e97..8992c5f 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel
@@ -4,8 +4,8 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
 load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
 
 package(
     features = ["layering_check"],
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dispatch_region_formation.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dispatch_region_formation.mlir
index 369a270..af63269 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dispatch_region_formation.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dispatch_region_formation.mlir
@@ -13,11 +13,13 @@
   return %0 : tensor<?x?xf32>
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  } // @__transform_main
+} // module
 
 // -----
 
@@ -41,13 +43,15 @@
   return %0, %1 : tensor<?x?xf32>, tensor<?x?xf32>
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
-  %1 = transform.structured.match ops{["test.dummy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.iree.clone_preceding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
+    %1 = transform.structured.match ops{["test.dummy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.iree.clone_preceding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
+    transform.yield
+  } // @__transform_main
+} // module
 
 // -----
 
@@ -71,13 +75,15 @@
   return %1, %0 : tensor<?x?xf32>, tensor<?x?xf32>
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
-  %1 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.iree.move_preceding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
+    %1 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.iree.move_preceding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
+    transform.yield
+  } // @__transform_main
+} // module
 
 // -----
 
@@ -95,12 +101,14 @@
   return %matmul : tensor<5x5xf32>
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  %region_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
-  transform.iree.region_to_workgroups %region_op : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %region_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
+    transform.iree.region_to_workgroups %region_op : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  } // @__transform_main
+} // module
 
 // -----
 
@@ -127,13 +135,15 @@
   return %5 : tensor<?x?xf32>
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
-  %1 = transform.structured.match attributes{"__tagged__"} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.iree.clone_preceding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
+    %1 = transform.structured.match attributes{"__tagged__"} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.iree.clone_preceding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
+    transform.yield
+  } // @__transform_main
+} // module
 
 // -----
 
@@ -157,13 +167,15 @@
   return %1, %0 : tensor<?x?xf32>, tensor<?x?xf32>
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
-  %1 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.iree.move_succeeding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
+    %1 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.iree.move_succeeding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
+    transform.yield
+  } // @__transform_main
+} // module
 
 // -----
 
@@ -190,13 +202,15 @@
   return %5, %u : tensor<600x700xf32>, tensor<50x90xf32>
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["test.dummy_op"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  %dispatch_op = transform.iree.wrap_in_dispatch_region %0  {generateWorkload=false} : (!transform.any_op) -> !transform.any_op
-  %1 = transform.structured.match attributes{"__tagged__"} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.iree.move_succeeding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["test.dummy_op"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %dispatch_op = transform.iree.wrap_in_dispatch_region %0  {generateWorkload=false} : (!transform.any_op) -> !transform.any_op
+    %1 = transform.structured.match attributes{"__tagged__"} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.iree.move_succeeding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
+    transform.yield
+  } // @__transform_main
+} // module
 
 // -----
 
@@ -217,13 +231,15 @@
   return %1, %0 : tensor<?x?xf32>, tensor<?x?xf32>
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  %dispatch_op = transform.iree.wrap_in_dispatch_region %0  {generateWorkload=false} : (!transform.any_op) -> !transform.any_op
-  %1 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.iree.clone_succeeding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %dispatch_op = transform.iree.wrap_in_dispatch_region %0  {generateWorkload=false} : (!transform.any_op) -> !transform.any_op
+    %1 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.iree.clone_succeeding_op_into_dispatch_region %1 into %dispatch_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
+    transform.yield
+  } // @__transform_main
+} // module
 
 // -----
 
@@ -250,8 +266,10 @@
   return %1 : tensor<4x?xf32>
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  } // @__transform_main
+} // module
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/BUILD.bazel b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/BUILD.bazel
index 64493c0..a8bb9f3 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/BUILD.bazel
@@ -4,8 +4,8 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
 load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
 
 package(
     features = ["layering_check"],
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/BUILD.bazel b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/BUILD.bazel
index d9b760a..fc34bff 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/BUILD.bazel
@@ -4,8 +4,8 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
 load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
 
 package(
     features = ["layering_check"],
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel
index 965b61f..1f2104a 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel
@@ -4,8 +4,8 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
 load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
 
 package(
     features = ["layering_check"],
diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/BUILD.bazel b/compiler/src/iree/compiler/GlobalOptimization/test/BUILD.bazel
index fa48db2..8d84f04 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/BUILD.bazel
@@ -4,8 +4,8 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
 load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
 
 package(
     features = ["layering_check"],
diff --git a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/BUILD.bazel b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/BUILD.bazel
index 09466b8..4c424a9 100644
--- a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/BUILD.bazel
@@ -4,8 +4,8 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
 load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
 
 package(
     features = ["layering_check"],
diff --git a/runtime/src/iree/base/tracing/BUILD.bazel b/runtime/src/iree/base/tracing/BUILD.bazel
index 680b97a..6870606 100644
--- a/runtime/src/iree/base/tracing/BUILD.bazel
+++ b/runtime/src/iree/base/tracing/BUILD.bazel
@@ -4,8 +4,8 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-load("//build_tools/bazel:build_defs.oss.bzl", "iree_runtime_cc_library")
 load("@bazel_skylib//rules:common_settings.bzl", "string_flag")
+load("//build_tools/bazel:build_defs.oss.bzl", "iree_runtime_cc_library")
 
 package(
     default_visibility = ["//visibility:public"],
diff --git a/samples/transform_dialect/example_module.mlir b/samples/transform_dialect/example_module.mlir
index 2b9275a..c5eab46 100644
--- a/samples/transform_dialect/example_module.mlir
+++ b/samples/transform_dialect/example_module.mlir
@@ -107,13 +107,13 @@
 }
 
 /// We test first with threading off so that the printers are legible.
-// RUN: iree-compile %s --iree-hal-target-backends=vulkan \
-// RUN:   --iree-codegen-use-transform-dialect-strategy=@transform_main \
-// RUN:   --iree-codegen-transform-dialect-library=%p/transform_library.mlir \
-// RUN:   --compile-from=executable-sources \
-// RUN:   --compile-to=executable-targets \
-// RUN:   --mlir-disable-threading | \
-// RUN: FileCheck %s --check-prefixes=CODEGEN-PRINTER
+// R-UN: iree-compile %s --iree-hal-target-backends=vulkan \
+// R-UN:   --iree-codegen-use-transform-dialect-strategy=transform_main \
+// R-UN:   --iree-codegen-transform-dialect-library=%p/transform_library.mlir \
+// R-UN:   --compile-from=executable-sources \
+// R-UN:   --compile-to=executable-targets \
+// R-UN:   --mlir-disable-threading | \
+// R-UN: FileCheck %s --check-prefixes=CODEGEN-PRINTER
 
 // CODEGEN-PRINTER:     IR printer: Setting matmul strategy to default top-level
 // CODEGEN-PRINTER:       translation_info = #iree_codegen.translation_info<TransformDialectCodegen codegen_spec = @transform_main
diff --git a/tests/transform_dialect/cpu/attention.mlir b/tests/transform_dialect/cpu/attention.mlir
index 00591b1..9dd587f 100644
--- a/tests/transform_dialect/cpu/attention.mlir
+++ b/tests/transform_dialect/cpu/attention.mlir
@@ -9,7 +9,8 @@
 }
 
 // RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \
-// RUN: --iree-codegen-use-transform-dialect-strategy=%p/attention_codegen_spec.mlir | \
+// RUN: --iree-codegen-transform-dialect-library=%p/attention_codegen_spec.mlir \
+// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \
 // RUN: iree-run-module --module=- --function=attention | \
 // RUN: FileCheck %s --check-prefixes=EXEC
 
diff --git a/tests/transform_dialect/cpu/attention_codegen_spec.mlir b/tests/transform_dialect/cpu/attention_codegen_spec.mlir
index b1e4315..f31f50e 100644
--- a/tests/transform_dialect/cpu/attention_codegen_spec.mlir
+++ b/tests/transform_dialect/cpu/attention_codegen_spec.mlir
@@ -1,5 +1,8 @@
-transform.sequence failures(propagate) {
-  ^bb0(%variant_op: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+
+  // Codegen.
+  transform.named_sequence @codegen(
+      %variant_op: !transform.any_op {transform.consumed}) {
 
     // Get attention op
     // ==========================================
@@ -18,7 +21,7 @@
     // ==========================================
     %attention2 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     %acc_fill, %max_fill, %sum_fill, %inner_loop, %fill_op, %first_matmul, %reduce_max, %partial_softmax, %update, %reduce_sum,
-    %reciprocal_sum, %softmax, %scale_acc, %second_matmul = tile_and_decompose_attention %attention2 :
+    %reciprocal_sum, %softmax, %scale_acc, %second_matmul = transform.tile_and_decompose_attention %attention2 :
        (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op,!transform.any_op,  !transform.any_op, !transform.any_op)
 
     // Vectorize function
@@ -58,4 +61,22 @@
     } : !transform.any_op
     transform.iree.apply_cse %func_8 : !transform.any_op
     transform.memref.erase_dead_alloc_and_stores %func_8 : (!transform.any_op) -> ()
-}
+    transform.yield
+  } // codegen
+  
+  // Find `hal.executable.variant`.
+  transform.named_sequence @match_variant_for_codegen(%root: !transform.any_op {transform.readonly}) 
+    -> !transform.any_op {
+    transform.match.operation_name %root ["hal.executable.variant"] : !transform.any_op
+    transform.yield %root : !transform.any_op
+  }
+
+  // Transform entry-point
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.consumed}) {
+    transform.foreach_match in %root
+        @match_variant_for_codegen -> @codegen
+      : (!transform.any_op) -> (!transform.any_op)
+    transform.yield 
+  }
+} // module
+
diff --git a/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir b/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir
index c041d46..dbafce8 100644
--- a/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir
+++ b/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir
@@ -9,13 +9,6 @@
 // RUN:     --iree-hal-configuration-pipeline | \
 // RUN: FileCheck %s
 
-// Check that compilation runs all the way to the end.
-// TODO: this currently fails with:
-//   'memref.alloca' op all stack allocations need to be hoisted to the entry block of the function
-//
-// R-UN: iree-opt %s --iree-transform-dialect-interpreter --transform-dialect-drop-schedule | \
-// R-UN: iree-compile --iree-hal-target-backends=llvm-cpu
-
 !a_tensor_t = tensor<1234x567xf32>
 !b_tensor_t = tensor<567x890xf32>
 !c_tensor_t = tensor<1234x890xf32>
@@ -57,14 +50,15 @@
 
 // CHECK-LABEL: func.func @matmul_dispatch_4
 //       CHECK:   tensor.unpack
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
+    %matmul = transform.structured.match interface{LinalgOp} in %module_op
+      : (!transform.any_op) -> (!transform.any_op)
 
-transform.sequence failures(propagate) {
-^bb1(%module_op: !transform.any_op):
-  %matmul = transform.structured.match interface{LinalgOp} in %module_op
-    : (!transform.any_op) -> (!transform.any_op)
-
-  transform.structured.pack_greedily %matmul
-      matmul_packed_sizes = [8, 16, 32] 
-      matmul_inner_dims_order = [0, 1, 2]
-    : (!transform.any_op) -> !transform.op<"linalg.generic">
-}
+    transform.structured.pack_greedily %matmul
+        matmul_packed_sizes = [8, 16, 32]
+        matmul_inner_dims_order = [0, 1, 2]
+      : (!transform.any_op) -> !transform.op<"linalg.generic">
+    transform.yield 
+  }
+} // module
diff --git a/tests/transform_dialect/cpu/contraction-packing.mlir b/tests/transform_dialect/cpu/contraction-packing.mlir
index 2ab912f..d103303 100644
--- a/tests/transform_dialect/cpu/contraction-packing.mlir
+++ b/tests/transform_dialect/cpu/contraction-packing.mlir
@@ -136,16 +136,19 @@
   return %0 : !ct_tensor_t
 }
 
-transform.sequence failures(propagate) {
-^bb1(%module_op: !transform.any_op):
-  %matmul = transform.structured.match interface{LinalgOp} in %module_op
-    : (!transform.any_op) -> (!transform.any_op)
-  
-  // Generalized packing rewrite extracts a gemm from any linalg op that contains 
-  // one. This acts as a powerful normalization step: after this point, we have a
-  // gemm (i.e. 3-D contraction with (m,n,k)=(8,16,32) ) on the 3 most minor
-  // dimensions.
-  transform.structured.pack_greedily %matmul
-      matmul_packed_sizes = [8, 16, 32] matmul_inner_dims_order = [0, 1, 2]
-    : (!transform.any_op) -> !transform.op<"linalg.generic">
-}
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
+    %matmul = transform.structured.match interface{LinalgOp} in %module_op
+      : (!transform.any_op) -> (!transform.any_op)
+    
+    // Generalized packing rewrite extracts a gemm from any linalg op that contains 
+    // one. This acts as a powerful normalization step: after this point, we have a
+    // gemm (i.e. 3-D contraction with (m,n,k)=(8,16,32) ) on the 3 most minor
+    // dimensions.
+    transform.structured.pack_greedily %matmul
+        matmul_packed_sizes = [8, 16, 32] matmul_inner_dims_order = [0, 1, 2]
+      : (!transform.any_op) -> !transform.op<"linalg.generic">
+    transform.yield 
+  }
+} // module
+
diff --git a/tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir b/tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir
index 8e10591..18c675a 100644
--- a/tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir
+++ b/tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir
@@ -4,13 +4,6 @@
 // added to IREE in https://github.com/openxla/iree/pull/14373, as a workaround
 // for other patterns being sensitive to these exact transforms.
 
-transform.sequence failures(propagate) {
-^bb1(%func_op: !transform.op<"func.func">):
-  transform.apply_patterns to %func_op {
-    transform.apply_patterns.iree.fold_tensor_slice_into_transfer
-  } : !transform.op<"func.func">
-}
-
 // CHECK-LABEL: func @transfer_read_of_extract_slice(
 //  CHECK-SAME:     %[[t:.*]]: tensor<?x?xf32>, %[[s1:.*]]: index, %[[s2:.*]]: index
 //   CHECK-DAG:   %[[c4:.*]] = arith.constant 4 : index
@@ -106,3 +99,12 @@
   %1 = tensor.insert_slice %0 into %t1[4, 3, %s] [1, 5, 6] [1, 1, 1] : tensor<5x6xf32> into tensor<?x?x12xf32>
   return %1 : tensor<?x?x12xf32>
 }
+
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+    transform.apply_patterns to %func_op {
+      transform.apply_patterns.iree.fold_tensor_slice_into_transfer
+    } : !transform.op<"func.func">
+    transform.yield 
+  }
+} // module
diff --git a/tests/transform_dialect/cpu/matmul.mlir b/tests/transform_dialect/cpu/matmul.mlir
index 8bc4c7f..63d059e 100644
--- a/tests/transform_dialect/cpu/matmul.mlir
+++ b/tests/transform_dialect/cpu/matmul.mlir
@@ -16,7 +16,8 @@
 // RUN:   --iree-stream-transformation-pipeline \
 // RUN:   --iree-hal-configuration-pipeline | \
 // RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmcpu-lower-executable-target)))' \
-// RUN:   --iree-codegen-use-transform-dialect-strategy=%p/matmul_codegen_default_spec.mlir | \
+// RUN:   --iree-codegen-transform-dialect-library=%p/matmul_codegen_default_spec.mlir \
+// RUN:   --iree-codegen-use-transform-dialect-strategy=codegen | \
 // RUN: FileCheck %s --check-prefixes=CODEGEN-DEFAULT
 
 // CODEGEN-DEFAULT:     hal.executable.export public @matmul_static_dispatch_0_matmul_3x3x5
diff --git a/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir b/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir
index b84a4ba..df5231d 100644
--- a/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir
+++ b/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir
@@ -1,24 +1,27 @@
 // RUN: iree-opt %s
 
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
-  %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @codegen(
+      %variant_op: !transform.any_op {transform.consumed}) {
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
 
-  // Step 1. Tile to forall with tile_sizes [2].
-  // ===================================================
-  %tiled_generic, %forall =
-    transform.structured.tile_using_forall %matmul tile_sizes [2]
-      ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall
-    : (!transform.any_op) -> ()
+    // Step 1. Tile to forall with tile_sizes [2].
+    // ===================================================
+    %tiled_generic, %forall =
+      transform.structured.tile_using_forall %matmul tile_sizes [2]
+        ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall
+      : (!transform.any_op) -> ()
 
-  // Step 2. Bufferize and drop HAL decriptor from memref ops.
-  // =========================================================
-  transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-  %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> !transform.any_op
+    // Step 2. Bufferize and drop HAL descriptor from memref ops.
+    // =========================================================
+    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+    %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> !transform.any_op
 
-  // Step 3. Post-bufferization mapping workgroup.
-  // =========================================================
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
-}
+    // Step 3. Post-bufferization mapping workgroup.
+    // =========================================================
+    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
+    transform.yield 
+  }
+} // module
diff --git a/tests/transform_dialect/cpu/matmul_library_call.mlir b/tests/transform_dialect/cpu/matmul_library_call.mlir
index 5dd24db..211a598 100644
--- a/tests/transform_dialect/cpu/matmul_library_call.mlir
+++ b/tests/transform_dialect/cpu/matmul_library_call.mlir
@@ -13,7 +13,7 @@
 }
 
 // RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \
-// RUN:   --iree-codegen-use-transform-dialect-strategy=@custom_matmul \
+// RUN:   --iree-codegen-use-transform-dialect-strategy=custom_matmul \
 // RUN:   --iree-codegen-transform-dialect-library=%p/transform_library.mlir \
 // RUN:   --compile-to=executable-targets | \
 // RUN: FileCheck %s --check-prefixes=CODEGEN-DEFAULT
@@ -25,7 +25,7 @@
 
 // RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \
 // RUN:   --iree-codegen-transform-dialect-library=%p/transform_library.mlir \
-// RUN:   --iree-codegen-use-transform-dialect-strategy=@custom_matmul | \
+// RUN:   --iree-codegen-use-transform-dialect-strategy=custom_matmul | \
 // RUN: iree-run-module --module=- --function=matmul_static \
 // RUN:   --input="3x5xf32=1" \
 // RUN:   --input="5x3xf32=2" \
diff --git a/tests/transform_dialect/cuda/BUILD.bazel b/tests/transform_dialect/cuda/BUILD.bazel
index 8615fab..a7903fa 100644
--- a/tests/transform_dialect/cuda/BUILD.bazel
+++ b/tests/transform_dialect/cuda/BUILD.bazel
@@ -35,7 +35,6 @@
         "softmax_v2.mlir",
         # First few ops of softmax only, acts as a proxy example.
         "softmax_partial.mlir",
-        "vecadd2d.mlir",
     ],
     cfg = "//tests:lit.cfg.py",
     # transform dialect spec files are MLIR files that specify a transformation,
@@ -57,8 +56,6 @@
         "softmax_dispatch_spec.mlir",
         # First few ops of softmax only, acts as a proxy example.
         "softmax_partial_codegen_spec.mlir",
-        "vecadd2d_codegen_spec.mlir",
-        "vecadd2d_codegen_spec_partial_tile.mlir",
     ],
     tags = [
         # CUDA cuInit fails with sanitizer on.
diff --git a/tests/transform_dialect/cuda/CMakeLists.txt b/tests/transform_dialect/cuda/CMakeLists.txt
index b2d4d1d..ab23d81 100644
--- a/tests/transform_dialect/cuda/CMakeLists.txt
+++ b/tests/transform_dialect/cuda/CMakeLists.txt
@@ -26,7 +26,6 @@
     "softmax.mlir"
     "softmax_partial.mlir"
     "softmax_v2.mlir"
-    "vecadd2d.mlir"
   TOOLS
     FileCheck
     iree-compile
@@ -43,8 +42,6 @@
     softmax_dispatch_spec.mlir
     softmax_partial_codegen_spec.mlir
     softmax_v2_codegen_spec.mlir
-    vecadd2d_codegen_spec.mlir
-    vecadd2d_codegen_spec_partial_tile.mlir
   LABELS
     "noasan"
     "nomsan"
diff --git a/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir b/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir
index 93b143c..cb53367 100644
--- a/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir
+++ b/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir
@@ -15,7 +15,8 @@
 // RUN:     --iree-hal-cuda-llvm-target-arch=sm_80 \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
 // RUN:     --iree-flow-dispatch-use-transform-dialect=%p/double_mma_layout_analysis_dispatch_spec.mlir \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/double_mma_layout_analysis_codegen_spec.mlir | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/double_mma_layout_analysis_codegen_spec.mlir \
+// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
 // RUN: iree-run-module --module=- --function=double_matmul --device=cuda \
 // RUN: --input="16x16xf16=[[0.0999755859375,0.2249755859375,0.07501220703125,0.0,0.07501220703125,0.2249755859375,0.175048828125,0.07501220703125,0.175048828125,0.07501220703125,0.024993896484375,0.1500244140625,0.1500244140625,0.2249755859375,0.199951171875,0.1500244140625],[0.1500244140625,0.199951171875,0.0999755859375,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.0999755859375,0.0999755859375,0.024993896484375,0.2249755859375,0.2249755859375,0.2249755859375,0.0,0.024993896484375,0.04998779296875],[0.07501220703125,0.0,0.125,0.125,0.04998779296875,0.2249755859375,0.024993896484375,0.199951171875,0.199951171875,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.175048828125,0.07501220703125,0.125],[0.04998779296875,0.024993896484375,0.0,0.2249755859375,0.07501220703125,0.024993896484375,0.024993896484375,0.0,0.07501220703125,0.1500244140625,0.1500244140625,0.175048828125,0.2249755859375,0.1500244140625,0.07501220703125,0.0999755859375],[0.125,0.0,0.199951171875,0.04998779296875,0.199951171875,0.04998779296875,0.175048828125,0.125,0.0,0.0,0.199951171875,0.024993896484375,0.2249755859375,0.1500244140625,0.024993896484375,0.0],[0.04998779296875,0.2249755859375,0.0999755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.199951171875,0.125,0.07501220703125,0.04998779296875,0.199951171875,0.125,0.1500244140625],[0.1500244140625,0.125,0.175048828125,0.04998779296875,0.125,0.1500244140625,0.1500244140625,0.125,0.0999755859375,0.0,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.125,0.0999755859375],[0.0999755859375,0.199951171875,0.0999755859375,0.0999755859375,0.2249755859375,0.0,0.175048828125,0.0999755859375,0.125,0.07501220703125,0.07501220703125,0.175048828125,0.07501220703125,0.0,0.2249755859375,0.2249755859375],[0.07501220703125,0.024993896484375,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.0999755859375,0.024993896484375,0.0,0.099
9755859375,0.0,0.0999755859375,0.2249755859375,0.175048828125,0.0,0.0],[0.024993896484375,0.0999755859375,0.2249755859375,0.2249755859375,0.125,0.2249755859375,0.04998779296875,0.04998779296875,0.04998779296875,0.024993896484375,0.0999755859375,0.2249755859375,0.024993896484375,0.024993896484375,0.0,0.07501220703125],[0.0,0.1500244140625,0.175048828125,0.1500244140625,0.2249755859375,0.024993896484375,0.1500244140625,0.0999755859375,0.024993896484375,0.0,0.125,0.04998779296875,0.125,0.199951171875,0.024993896484375,0.199951171875],[0.024993896484375,0.04998779296875,0.199951171875,0.0,0.07501220703125,0.199951171875,0.2249755859375,0.04998779296875,0.175048828125,0.0,0.199951171875,0.199951171875,0.1500244140625,0.199951171875,0.125,0.199951171875],[0.1500244140625,0.125,0.04998779296875,0.0999755859375,0.04998779296875,0.175048828125,0.04998779296875,0.0999755859375,0.2249755859375,0.199951171875,0.125,0.1500244140625,0.0999755859375,0.07501220703125,0.07501220703125,0.0999755859375],[0.0,0.04998779296875,0.125,0.024993896484375,0.04998779296875,0.199951171875,0.04998779296875,0.0999755859375,0.199951171875,0.07501220703125,0.1500244140625,0.125,0.199951171875,0.199951171875,0.0,0.125],[0.024993896484375,0.07501220703125,0.0,0.199951171875,0.024993896484375,0.024993896484375,0.024993896484375,0.175048828125,0.04998779296875,0.04998779296875,0.04998779296875,0.07501220703125,0.07501220703125,0.1500244140625,0.175048828125,0.199951171875],[0.0,0.125,0.0,0.07501220703125,0.125,0.125,0.07501220703125,0.1500244140625,0.04998779296875,0.04998779296875,0.125,0.125,0.2249755859375,0.0999755859375,0.07501220703125,0.07501220703125]]" \
 // RUN: --input="16x16xf16=[[0.175048828125,0.07501220703125,0.199951171875,0.0,0.175048828125,0.125,0.199951171875,0.04998779296875,0.0999755859375,0.175048828125,0.07501220703125,0.04998779296875,0.125,0.125,0.07501220703125,0.2249755859375],[0.024993896484375,0.199951171875,0.0,0.1500244140625,0.175048828125,0.0999755859375,0.175048828125,0.1500244140625,0.2249755859375,0.07501220703125,0.199951171875,0.0999755859375,0.0999755859375,0.2249755859375,0.0999755859375,0.0999755859375],[0.2249755859375,0.2249755859375,0.125,0.175048828125,0.0,0.07501220703125,0.04998779296875,0.0,0.199951171875,0.1500244140625,0.024993896484375,0.2249755859375,0.024993896484375,0.1500244140625,0.2249755859375,0.199951171875],[0.1500244140625,0.125,0.024993896484375,0.07501220703125,0.125,0.125,0.07501220703125,0.1500244140625,0.04998779296875,0.175048828125,0.125,0.175048828125,0.175048828125,0.07501220703125,0.024993896484375,0.125],[0.2249755859375,0.125,0.2249755859375,0.1500244140625,0.0,0.0,0.1500244140625,0.125,0.024993896484375,0.125,0.0,0.024993896484375,0.175048828125,0.175048828125,0.024993896484375,0.125],[0.2249755859375,0.024993896484375,0.04998779296875,0.0,0.0,0.1500244140625,0.07501220703125,0.2249755859375,0.1500244140625,0.024993896484375,0.0,0.0999755859375,0.125,0.1500244140625,0.2249755859375,0.0],[0.125,0.0999755859375,0.0,0.0999755859375,0.199951171875,0.125,0.175048828125,0.175048828125,0.1500244140625,0.2249755859375,0.04998779296875,0.125,0.1500244140625,0.0,0.0,0.0999755859375],[0.125,0.07501220703125,0.175048828125,0.1500244140625,0.175048828125,0.0,0.04998779296875,0.125,0.125,0.024993896484375,0.0999755859375,0.175048828125,0.024993896484375,0.0,0.024993896484375,0.0],[0.2249755859375,0.024993896484375,0.0999755859375,0.04998779296875,0.125,0.07501220703125,0.0999755859375,0.024993896484375,0.125,0.125,0.125,0.024993896484375,0.125,0.04998779296875,0.0999755859375,0.07501220703125],[0.0999755859375,0.175048828125,0.199951171875,0.0999755859375,0.17504882
8125,0.07501220703125,0.024993896484375,0.125,0.07501220703125,0.0,0.125,0.07501220703125,0.07501220703125,0.0,0.199951171875,0.175048828125],[0.07501220703125,0.0999755859375,0.175048828125,0.07501220703125,0.125,0.1500244140625,0.0,0.0999755859375,0.2249755859375,0.199951171875,0.04998779296875,0.0,0.0,0.1500244140625,0.199951171875,0.2249755859375],[0.024993896484375,0.2249755859375,0.04998779296875,0.1500244140625,0.2249755859375,0.2249755859375,0.175048828125,0.0999755859375,0.024993896484375,0.199951171875,0.125,0.199951171875,0.175048828125,0.2249755859375,0.175048828125,0.0999755859375],[0.125,0.0999755859375,0.04998779296875,0.125,0.199951171875,0.07501220703125,0.199951171875,0.0,0.024993896484375,0.04998779296875,0.0,0.04998779296875,0.04998779296875,0.199951171875,0.1500244140625,0.0999755859375],[0.199951171875,0.0,0.125,0.04998779296875,0.07501220703125,0.175048828125,0.0999755859375,0.175048828125,0.024993896484375,0.07501220703125,0.0,0.1500244140625,0.07501220703125,0.024993896484375,0.07501220703125,0.175048828125],[0.1500244140625,0.125,0.0999755859375,0.175048828125,0.04998779296875,0.0,0.04998779296875,0.1500244140625,0.024993896484375,0.125,0.125,0.175048828125,0.125,0.0999755859375,0.175048828125,0.1500244140625],[0.07501220703125,0.199951171875,0.024993896484375,0.0999755859375,0.175048828125,0.07501220703125,0.1500244140625,0.04998779296875,0.0,0.024993896484375,0.07501220703125,0.07501220703125,0.1500244140625,0.04998779296875,0.2249755859375,0.1500244140625]]" \
diff --git a/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir
index 11a6372..9982106 100644
--- a/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir
@@ -1,69 +1,73 @@
 // RUN: iree-opt %s
 
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @codegen(
+      %variant_op: !transform.any_op {transform.consumed}) {
 
-  // Step 1. Find the fill and matmul ops
-  // ===========================================================================
-  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %fill0, %fill1 = transform.split_handle %fill : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  %matmul0, %matmul1 = transform.split_handle %matmul : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    // Step 1. Find the fill and matmul ops
+    // ===========================================================================
+    %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %fill0, %fill1 = transform.split_handle %fill : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %matmul0, %matmul1 = transform.split_handle %matmul : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
-  // Step 2. Tile the matmul and fuse the fill
-  // ===========================================================================
-  %grid_reduction, %forall_grid =
-  transform.structured.tile_using_forall %matmul1 tile_sizes [16] ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
+    // Step 2. Tile the matmul and fuse the fill
+    // ===========================================================================
+    %grid_reduction, %forall_grid =
+    transform.structured.tile_using_forall %matmul1 tile_sizes [16] ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
 
-  transform.structured.fuse_into_containing_op %fill1 into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.structured.fuse_into_containing_op %matmul0 into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.structured.fuse_into_containing_op %fill0 into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.structured.fuse_into_containing_op %fill1 into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.structured.fuse_into_containing_op %matmul0 into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.structured.fuse_into_containing_op %fill0 into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
 
-  // Step 3. Vectorize
-  // ===========================================================================
-  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
-  %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
+    // Step 3. Vectorize
+    // ===========================================================================
+    %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+    } : !transform.any_op
+    %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
 
-  // Step 4. Bufferize
-  // ===========================================================================
-  transform.apply_patterns to %func_3 {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-  } : !transform.any_op
-  transform.apply_patterns to %func_3 {
-    transform.apply_patterns.tensor.reassociative_reshape_folding
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_cse %func_3 : !transform.any_op
-  transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-  transform.apply_patterns to %func_3 {
-    transform.apply_patterns.linalg.erase_unnecessary_inputs
-  } : !transform.any_op
-  %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    // Step 4. Bufferize
+    // ===========================================================================
+    transform.apply_patterns to %func_3 {
+      transform.apply_patterns.iree.fold_fill_into_pad
+      transform.apply_patterns.linalg.tiling_canonicalization
+      transform.apply_patterns.scf.for_loop_canonicalization
+    } : !transform.any_op
+    transform.apply_patterns to %func_3 {
+      transform.apply_patterns.tensor.reassociative_reshape_folding
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.iree.apply_cse %func_3 : !transform.any_op
+    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+    transform.apply_patterns to %func_3 {
+      transform.apply_patterns.linalg.erase_unnecessary_inputs
+    } : !transform.any_op
+    %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
+    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
 
-  // Step 5. Pre-process the contract and transfer ops to put it in the right form.
-  // ===========================================================================
-  %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func_2 {
-    transform.apply_patterns.iree.prepare_vector_to_mma
-  } : !transform.any_op
+    // Step 5. Pre-process the contract and transfer ops to put it in the right form.
+    // ===========================================================================
+    %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func_2 {
+      transform.apply_patterns.iree.prepare_vector_to_mma
+    } : !transform.any_op
 
-  // Step 6. Post-bufferization vector distribution
-  // ===========================================================================
-  %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
-  transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
+    // Step 6. Post-bufferization vector distribution
+    // ===========================================================================
+    %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
+    transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
 
-  // Step 7. Do layout analysis and lower to mma
-  // ===========================================================================
-  %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
-}
+    // Step 7. Do layout analysis and lower to mma
+    // ===========================================================================
+    %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
+
+    transform.yield
+  }
+} // module
diff --git a/tests/transform_dialect/cuda/mma.mlir b/tests/transform_dialect/cuda/mma.mlir
index 92d5cf8..2093e1c 100644
--- a/tests/transform_dialect/cuda/mma.mlir
+++ b/tests/transform_dialect/cuda/mma.mlir
@@ -27,21 +27,25 @@
   return
 }
 
-transform.sequence failures(propagate) {
-^bb1(%module: !transform.any_op):
-  %func = transform.structured.match ops{["func.func"]} in %module
-    : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func {
-    transform.apply_patterns.iree.unroll_vectors_gpu_wmma_sync
-  } : !transform.any_op
-  transform.iree.vector.vector_to_mma_conversion %func { use_wmma } : (!transform.any_op) -> ()
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(
+      %module: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %module
+      : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.iree.unroll_vectors_gpu_wmma_sync
+    } : !transform.any_op
+    transform.iree.vector.vector_to_mma_conversion %func { use_wmma } : (!transform.any_op) -> ()
 
-  // Apply canonicalization post-hoc to trigger DCE and pass the test 
-  // (i.e. all vector.contract are dead).
-  // TODO: consider having the vector_to_mma_conversion do the DCE automatically.
-  transform.apply_patterns to %func {
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
+    // Apply canonicalization post-hoc to trigger DCE and pass the test
+    // (i.e. all vector.contract are dead).
+    // TODO: consider having the vector_to_mma_conversion do the DCE automatically.
+    transform.apply_patterns to %func {
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+
+    transform.yield
+  }
 }
 
 // -----
@@ -71,20 +75,23 @@
   return
 }
 
-transform.sequence failures(propagate) {
-^bb1(%module: !transform.any_op):
-  %func = transform.structured.match ops{["func.func"]} in %module
-    : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func {
-    transform.apply_patterns.iree.unroll_vectors_gpu_mma_sync
-  } : !transform.any_op
-  transform.iree.vector.vector_to_mma_conversion %func { use_mma_sync } : (!transform.any_op) -> ()
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(
+      %module: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %module
+      : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.iree.unroll_vectors_gpu_mma_sync
+    } : !transform.any_op
+    transform.iree.vector.vector_to_mma_conversion %func { use_mma_sync } : (!transform.any_op) -> ()
 
-  // Apply canonicalization post-hoc to trigger DCE and pass the test 
-  // (i.e. all vector.contract are dead).
-  // TODO: consider having the vector_to_mma_conversion do the DCE automatically.
-  transform.apply_patterns to %func {
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
+    // Apply canonicalization post-hoc to trigger DCE and pass the test
+    // (i.e. all vector.contract are dead).
+    // TODO: consider having the vector_to_mma_conversion do the DCE automatically.
+    transform.apply_patterns to %func {
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+
+    transform.yield
+  }
 }
-
diff --git a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis.mlir b/tests/transform_dialect/cuda/mma_elemwise_layout_analysis.mlir
index e1cbe68..aaf5801 100644
--- a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis.mlir
+++ b/tests/transform_dialect/cuda/mma_elemwise_layout_analysis.mlir
@@ -18,7 +18,7 @@
 // RUN: iree-compile %s --iree-hal-target-backends=cuda \
 // RUN:     --iree-hal-cuda-llvm-target-arch=sm_80 \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/mma_elemwise_layout_analysis_codegen_spec.mlir | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/mma_elemwise_layout_analysis_codegen_spec.mlir | \
 // RUN: iree-run-module --module=- --function=matmul --device=cuda \
 // RUN: --input="16x16xf16=[[0.0999755859375,0.2249755859375,0.07501220703125,0.0,0.07501220703125,0.2249755859375,0.175048828125,0.07501220703125,0.175048828125,0.07501220703125,0.024993896484375,0.1500244140625,0.1500244140625,0.2249755859375,0.199951171875,0.1500244140625],[0.1500244140625,0.199951171875,0.0999755859375,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.0999755859375,0.0999755859375,0.024993896484375,0.2249755859375,0.2249755859375,0.2249755859375,0.0,0.024993896484375,0.04998779296875],[0.07501220703125,0.0,0.125,0.125,0.04998779296875,0.2249755859375,0.024993896484375,0.199951171875,0.199951171875,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.175048828125,0.07501220703125,0.125],[0.04998779296875,0.024993896484375,0.0,0.2249755859375,0.07501220703125,0.024993896484375,0.024993896484375,0.0,0.07501220703125,0.1500244140625,0.1500244140625,0.175048828125,0.2249755859375,0.1500244140625,0.07501220703125,0.0999755859375],[0.125,0.0,0.199951171875,0.04998779296875,0.199951171875,0.04998779296875,0.175048828125,0.125,0.0,0.0,0.199951171875,0.024993896484375,0.2249755859375,0.1500244140625,0.024993896484375,0.0],[0.04998779296875,0.2249755859375,0.0999755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.199951171875,0.125,0.07501220703125,0.04998779296875,0.199951171875,0.125,0.1500244140625],[0.1500244140625,0.125,0.175048828125,0.04998779296875,0.125,0.1500244140625,0.1500244140625,0.125,0.0999755859375,0.0,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.125,0.0999755859375],[0.0999755859375,0.199951171875,0.0999755859375,0.0999755859375,0.2249755859375,0.0,0.175048828125,0.0999755859375,0.125,0.07501220703125,0.07501220703125,0.175048828125,0.07501220703125,0.0,0.2249755859375,0.2249755859375],[0.07501220703125,0.024993896484375,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.0999755859375,0.024993896484375,0.0,0.099
9755859375,0.0,0.0999755859375,0.2249755859375,0.175048828125,0.0,0.0],[0.024993896484375,0.0999755859375,0.2249755859375,0.2249755859375,0.125,0.2249755859375,0.04998779296875,0.04998779296875,0.04998779296875,0.024993896484375,0.0999755859375,0.2249755859375,0.024993896484375,0.024993896484375,0.0,0.07501220703125],[0.0,0.1500244140625,0.175048828125,0.1500244140625,0.2249755859375,0.024993896484375,0.1500244140625,0.0999755859375,0.024993896484375,0.0,0.125,0.04998779296875,0.125,0.199951171875,0.024993896484375,0.199951171875],[0.024993896484375,0.04998779296875,0.199951171875,0.0,0.07501220703125,0.199951171875,0.2249755859375,0.04998779296875,0.175048828125,0.0,0.199951171875,0.199951171875,0.1500244140625,0.199951171875,0.125,0.199951171875],[0.1500244140625,0.125,0.04998779296875,0.0999755859375,0.04998779296875,0.175048828125,0.04998779296875,0.0999755859375,0.2249755859375,0.199951171875,0.125,0.1500244140625,0.0999755859375,0.07501220703125,0.07501220703125,0.0999755859375],[0.0,0.04998779296875,0.125,0.024993896484375,0.04998779296875,0.199951171875,0.04998779296875,0.0999755859375,0.199951171875,0.07501220703125,0.1500244140625,0.125,0.199951171875,0.199951171875,0.0,0.125],[0.024993896484375,0.07501220703125,0.0,0.199951171875,0.024993896484375,0.024993896484375,0.024993896484375,0.175048828125,0.04998779296875,0.04998779296875,0.04998779296875,0.07501220703125,0.07501220703125,0.1500244140625,0.175048828125,0.199951171875],[0.0,0.125,0.0,0.07501220703125,0.125,0.125,0.07501220703125,0.1500244140625,0.04998779296875,0.04998779296875,0.125,0.125,0.2249755859375,0.0999755859375,0.07501220703125,0.07501220703125]]" \
 // RUN: --input="8x16xf16=[[0.175049 0.0999756 0.0249939 0.224976 0.224976 0.199951 0.150024 0.0499878 0.224976 0.0249939 0.224976 0.150024 0.125 0.150024 0.125 0.125][0.0750122 0.175049 0.199951 0.0750122 0.224976 0.150024 0.125 0.175049 0.125 0.125 0.0249939 0.0249939 0.0999756 0.224976 0.0750122 0.0249939][0.199951 0.0750122 0 0.199951 0.125 0.0249939 0.0249939 0.125 0.224976 0 0.0499878 0 0 0.0499878 0.175049 0.0999756][0 0.0499878 0.150024 0.0999756 0.175049 0.224976 0.0750122 0.175049 0.150024 0.0249939 0 0.0999756 0.0999756 0.125 0.150024 0.175049][0.175049 0.125 0.175049 0.0999756 0 0.0249939 0.125 0.175049 0 0.175049 0 0.125 0.199951 0.150024 0.175049 0.0249939][0.125 0.125 0.0999756 0.224976 0.0750122 0.150024 0.125 0.0750122 0 0.175049 0.150024 0.150024 0.125 0 0 0][0.199951 0.0750122 0.175049 0.0999756 0.0499878 0.224976 0.0750122 0.0249939 0.150024 0.0249939 0.0750122 0.224976 0.175049 0 0.0499878 0.0249939][0.0499878 0.224976 0.150024 0.0999756 0 0.199951 0.150024 0.125 0.125 0.125 0.224976 0 0.175049 0.0999756 0.125 0]]" \
diff --git a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir
index 3300cd0..e2d5780 100644
--- a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir
@@ -1,61 +1,64 @@
 // RUN: iree-opt %s
 
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(
+      %variant_op: !transform.any_op {transform.consumed}) {
+    // Step 1. Find the fill, matmul and generic ops
+    // ===========================================================================
+    %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %matmul = transform.structured.match ops{["linalg.matmul_transpose_b"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %generic = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
 
-  // Step 1. Find the fill, matmul and generic ops
-  // ===========================================================================
-  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %matmul = transform.structured.match ops{["linalg.matmul_transpose_b"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %generic = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    // Step 2. Tile the generic and fuse the fill and matmul
+    // ===========================================================================
+    %grid_reduction, %forall_grid =
+    transform.structured.tile_using_forall %generic tile_sizes [16] ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
 
-  // Step 2. Tile the generic and fuse the fill and matmul
-  // ===========================================================================
-  %grid_reduction, %forall_grid =
-  transform.structured.tile_using_forall %generic tile_sizes [16] ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
+    transform.structured.fuse_into_containing_op %matmul into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
 
-  transform.structured.fuse_into_containing_op %matmul into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    // Step 3. Vectorize
+    // ===========================================================================
+    %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+    } : !transform.any_op
+    %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
 
-  // Step 3. Vectorize
-  // ===========================================================================
-  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
-  %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
+    // Step 4. Bufferize
+    // ===========================================================================
+    transform.apply_patterns to %func_3 {
+      transform.apply_patterns.iree.fold_fill_into_pad
+      transform.apply_patterns.linalg.tiling_canonicalization
+      transform.apply_patterns.scf.for_loop_canonicalization
+    } : !transform.any_op
+    transform.apply_patterns to %func_3 {
+      transform.apply_patterns.tensor.reassociative_reshape_folding
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.iree.apply_cse %func_3 : !transform.any_op
+    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+    transform.apply_patterns to %func_3 {
+      transform.apply_patterns.linalg.erase_unnecessary_inputs
+    } : !transform.any_op
+    %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
+    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
 
-  // Step 4. Bufferize
-  // ===========================================================================
-  transform.apply_patterns to %func_3 {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-  } : !transform.any_op
-  transform.apply_patterns to %func_3 {
-    transform.apply_patterns.tensor.reassociative_reshape_folding
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_cse %func_3 : !transform.any_op
-  transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-  transform.apply_patterns to %func_3 {
-    transform.apply_patterns.linalg.erase_unnecessary_inputs
-  } : !transform.any_op
-  %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    // Step 6. Post-bufferization vector distribution
+    // ===========================================================================
+    %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
+    transform.iree.map_nested_forall_to_gpu_threads %func_7
+        workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
 
-  // Step 6. Post-bufferization vector distribution
-  // ===========================================================================
-  %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
-  transform.iree.map_nested_forall_to_gpu_threads %func_7
-      workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
+    // Step 7. Do layout analysis and lower to mma
+    // ===========================================================================
+    %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
+    transform.yield
+  }
+} // module
 
-  // Step 7. Do layout analysis and lower to mma
-  // ===========================================================================
-  %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
-}
diff --git a/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir b/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir
index 627d8f2..a99b19d 100644
--- a/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir
+++ b/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir
@@ -27,7 +27,8 @@
 // RUN:     --iree-hal-cuda-llvm-target-arch=sm_80 \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
 // RUN:     --iree-flow-dispatch-use-transform-dialect=%p/mma_reduction_layout_analysis_dispatch_spec.mlir \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/mma_reduction_layout_analysis_codegen_spec.mlir | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/mma_reduction_layout_analysis_codegen_spec.mlir \
+// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
 // RUN: iree-run-module --module=- --function=matmul_reduction --device=cuda \
 // RUN: --input="16x16xf16=[[3.0,2.0,2.5,4.5,1.5,4.0,2.0,2.5,4.0,4.0,1.5,0.5,2.0,3.0,0.5,2.0],[2.5,2.5,0.5,3.5,0.0,2.5,3.5,1.0,0.5,0.0,3.0,4.5,0.5,0.5,0.0,3.5],[4.5,3.0,4.0,2.5,1.0,0.5,0.0,4.5,0.0,2.5,3.5,0.0,2.0,4.5,1.5,4.5],[0.0,2.0,1.5,0.0,2.0,1.5,3.0,2.0,2.0,4.0,4.0,2.5,0.0,3.0,2.0,0.5],[0.5,3.5,3.0,2.5,0.0,2.5,3.0,3.0,4.5,2.0,2.0,1.0,2.0,1.0,3.5,2.0],[0.0,4.5,2.0,4.0,2.5,2.5,1.5,1.5,1.5,3.0,3.0,0.0,2.5,0.5,2.0,2.0],[3.5,4.0,3.5,1.5,2.0,0.5,1.0,2.5,4.0,3.5,0.0,3.0,0.0,1.5,4.5,0.0],[4.5,3.5,1.0,4.5,0.5,0.0,1.5,4.5,1.5,3.5,3.0,2.5,0.0,0.5,0.0,4.0],[2.0,3.0,0.5,2.0,1.5,0.5,2.0,2.5,2.5,4.0,2.0,4.5,4.0,0.0,2.0,3.0],[2.5,4.0,4.0,3.0,2.0,2.0,4.5,0.5,4.5,1.0,2.0,0.0,4.5,1.0,3.0,0.5],[4.0,1.5,3.5,3.0,2.5,4.5,1.0,3.5,3.0,2.5,2.5,2.0,2.0,4.5,1.5,2.5],[3.0,3.0,0.0,2.5,1.0,3.0,0.0,1.5,1.5,2.5,0.5,1.0,3.0,3.5,1.5,1.5],[0.0,4.5,0.5,1.5,0.5,4.0,3.5,4.0,4.0,0.0,0.5,1.0,4.5,1.5,0.0,3.5],[2.5,2.0,2.5,1.5,3.0,0.0,2.0,1.0,2.5,4.0,0.0,4.0,4.0,1.5,3.0,2.5],[3.0,0.0,4.0,4.0,2.0,0.5,1.0,3.5,4.0,2.5,4.0,4.5,0.0,3.0,1.5,2.5],[0.5,0.5,2.5,4.0,1.0,2.5,0.5,4.5,2.0,3.0,1.5,4.5,1.5,4.5,0.5,1.5]]" \
 // RUN: --input="16x16xf16=[[3.5,3.0,4.5,3.0,3.0,0.0,2.0,2.5,2.0,0.0,4.5,2.5,0.5,0.0,4.0,3.5],[0.0,0.5,2.0,4.5,0.0,4.0,1.5,3.5,0.5,2.5,3.5,1.5,3.5,4.5,4.0,3.0],[3.0,3.5,2.5,1.5,1.5,1.5,0.5,4.5,0.0,3.5,4.0,0.0,0.0,2.0,0.5,1.0],[1.5,4.0,3.5,3.5,0.0,0.0,0.0,2.0,3.0,1.5,0.0,3.0,0.0,2.5,2.0,3.0],[3.5,4.0,2.5,1.5,3.0,2.0,3.0,4.5,1.5,3.0,2.0,3.5,2.5,4.5,0.5,3.5],[0.0,0.0,0.0,0.5,1.0,2.5,1.5,1.0,2.5,1.5,0.0,1.5,1.5,2.0,4.5,2.5],[4.0,1.5,3.0,2.5,2.5,3.5,2.0,4.0,1.5,2.5,0.5,4.0,1.0,4.5,3.5,0.0],[1.0,2.0,4.0,4.5,4.5,3.5,0.0,1.0,4.5,3.5,2.0,3.0,0.5,4.0,3.5,1.5],[1.0,0.0,2.5,4.5,0.0,2.0,0.0,2.5,3.0,4.0,2.5,0.5,3.5,0.0,3.5,1.0],[0.0,3.5,4.0,0.0,0.0,4.5,1.0,3.5,1.5,3.0,2.0,1.0,0.5,0.5,2.0,0.0],[1.5,0.0,4.5,2.0,4.5,4.5,3.5,3.0,2.5,4.5,0.5,0.5,0.0,4.5,0.0,4.0],[4.5,3.5,4.0,4.0,1.5,4.0,1.0,4.0,2.5,0.5,4.5,3.5,3.5,0.5,4.5,3.0],[0.0,3.0,2.5,1.0,1.5,2.0,1.0,1.5,4.0,2.5,3.5,1.0,3.5,2.5,3.5,4.5],[1.5,4.5,2.0,2.0,2.0,0.5,4.0,2.0,4.0,3.5,4.0,1.0,1.5,2.5,1.0,0.0],[0.0,0.0,1.0,2.5,3.5,2.5,4.0,0.0,2.0,2.0,4.5,0.5,1.0,3.5,3.0,2.5],[2.0,2.0,0.5,2.0,4.5,2.5,3.0,1.5,4.5,2.0,3.5,3.0,1.0,2.0,1.5,2.0]]" |\
diff --git a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir
index 479aa34..1f2ca62 100644
--- a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir
@@ -1,61 +1,65 @@
 // RUN: iree-opt %s
 
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @codegen(
+      %variant_op: !transform.any_op {transform.consumed}) {
 
-  // Step 1. Find the fill, matmul and generic ops
-  // ===========================================================================
-  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %matmul = transform.structured.match ops{["linalg.matmul_transpose_b"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %generics = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %reduce, %broadcast = transform.split_handle %generics : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    // Step 1. Find the fill, matmul and generic ops
+    // ===========================================================================
+    %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %matmul = transform.structured.match ops{["linalg.matmul_transpose_b"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %generics = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %reduce, %broadcast = transform.split_handle %generics : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
-  // Step 2. Tile the matmul and fuse the fill
-  // ===========================================================================
-  %grid_reduction, %forall_grid =
-  transform.structured.tile_using_forall %broadcast tile_sizes [16] ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
-  transform.structured.fuse_into_containing_op %reduce into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.structured.fuse_into_containing_op %matmul into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    // Step 2. Tile the matmul and fuse the fill
+    // ===========================================================================
+    %grid_reduction, %forall_grid =
+    transform.structured.tile_using_forall %broadcast tile_sizes [16] ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
+    transform.structured.fuse_into_containing_op %reduce into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.structured.fuse_into_containing_op %matmul into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
 
-  // Step 3. Vectorize
-  // ===========================================================================
-  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
-  %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
+    // Step 3. Vectorize
+    // ===========================================================================
+    %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+    } : !transform.any_op
+    %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
 
-  // Step 4. Bufferize
-  // ===========================================================================
-  transform.apply_patterns to %func_3 {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-  } : !transform.any_op
-  transform.apply_patterns to %func_3 {
-    transform.apply_patterns.tensor.reassociative_reshape_folding
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_cse %func_3 : !transform.any_op
-  transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-  transform.apply_patterns to %func_3 {
-    transform.apply_patterns.linalg.erase_unnecessary_inputs
-  } : !transform.any_op
-  %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    // Step 4. Bufferize
+    // ===========================================================================
+    transform.apply_patterns to %func_3 {
+      transform.apply_patterns.iree.fold_fill_into_pad
+      transform.apply_patterns.linalg.tiling_canonicalization
+      transform.apply_patterns.scf.for_loop_canonicalization
+    } : !transform.any_op
+    transform.apply_patterns to %func_3 {
+      transform.apply_patterns.tensor.reassociative_reshape_folding
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.iree.apply_cse %func_3 : !transform.any_op
+    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+    transform.apply_patterns to %func_3 {
+      transform.apply_patterns.linalg.erase_unnecessary_inputs
+    } : !transform.any_op
+    %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
+    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
 
-  // Step 6. Post-bufferization vector distribution
-  // ===========================================================================
-  %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
-  transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
+    // Step 6. Post-bufferization vector distribution
+    // ===========================================================================
+    %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
+    transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
 
-  // Step 7. Do layout analysis and lower to mma
-  // ===========================================================================
-  %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
-}
+    // Step 7. Do layout analysis and lower to mma
+    // ===========================================================================
+    %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
+
+    transform.yield
+  }
+} // module
diff --git a/tests/transform_dialect/cuda/mma_using_layout_analysis.mlir b/tests/transform_dialect/cuda/mma_using_layout_analysis.mlir
index d0ca10c..132b9ea 100644
--- a/tests/transform_dialect/cuda/mma_using_layout_analysis.mlir
+++ b/tests/transform_dialect/cuda/mma_using_layout_analysis.mlir
@@ -10,7 +10,7 @@
 // RUN: iree-compile %s --iree-hal-target-backends=cuda \
 // RUN:     --iree-hal-cuda-llvm-target-arch=sm_80 \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/mma_using_layout_analysis_codegen_spec.mlir | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/mma_using_layout_analysis_codegen_spec.mlir | \
 // RUN: iree-run-module --module=- --function=matmul --device=cuda \
 // RUN: --input="16x16xf16=[[1.0,1.125,1.25,1.375,1.5,1.625,1.75,1.875,2.0,2.125,2.25,2.375,2.5,2.625,2.75,2.875],[3.0,3.125,3.25,3.375,3.5,3.625,3.75,3.875,4.0,4.125,4.25,4.375,4.5,4.625,4.75,4.875],[5.0,5.125,5.25,5.375,5.5,5.625,5.75,5.875,6.0,6.125,6.25,6.375,6.5,6.625,6.75,6.875],[7.0,7.125,7.25,7.375,7.5,7.625,7.75,7.875,8.0,8.125,8.25,8.375,8.5,8.625,8.75,8.875],[9.0,9.125,9.25,9.375,9.5,9.625,9.75,9.875,10.0,10.125,10.25,10.375,10.5,10.625,10.75,10.875],[11.0,11.125,11.25,11.375,11.5,11.625,11.75,11.875,12.0,12.125,12.25,12.375,12.5,12.625,12.75,12.875],[13.0,13.125,13.25,13.375,13.5,13.625,13.75,13.875,14.0,14.125,14.25,14.375,14.5,14.625,14.75,14.875],[15.0,15.125,15.25,15.375,15.5,15.625,15.75,15.875,16.0,16.125,16.25,16.375,16.5,16.625,16.75,16.875],[17.0,17.125,17.25,17.375,17.5,17.625,17.75,17.875,18.0,18.125,18.25,18.375,18.5,18.625,18.75,18.875],[19.0,19.125,19.25,19.375,19.5,19.625,19.75,19.875,20.0,20.125,20.25,20.375,20.5,20.625,20.75,20.875],[21.0,21.125,21.25,21.375,21.5,21.625,21.75,21.875,22.0,22.125,22.25,22.375,22.5,22.625,22.75,22.875],[23.0,23.125,23.25,23.375,23.5,23.625,23.75,23.875,24.0,24.125,24.25,24.375,24.5,24.625,24.75,24.875],[25.0,25.125,25.25,25.375,25.5,25.625,25.75,25.875,26.0,26.125,26.25,26.375,26.5,26.625,26.75,26.875],[27.0,27.125,27.25,27.375,27.5,27.625,27.75,27.875,28.0,28.125,28.25,28.375,28.5,28.625,28.75,28.875],[29.0,29.125,29.25,29.375,29.5,29.625,29.75,29.875,30.0,30.125,30.25,30.375,30.5,30.625,30.75,30.875],[31.0,31.125,31.25,31.375,31.5,31.625,31.75,31.875,32.0,32.125,32.25,32.375,32.5,32.625,32.75,32.875]]" \
 // RUN: --input="16x8xf16=[[1.0,1.125,1.25,1.375,1.5,1.625,1.75,1.875],[2.0,2.125,2.25,2.375,2.5,2.625,2.75,2.875],[3.0,3.125,3.25,3.375,3.5,3.625,3.75,3.875],[4.0,4.125,4.25,4.375,4.5,4.625,4.75,4.875],[5.0,5.125,5.25,5.375,5.5,5.625,5.75,5.875],[6.0,6.125,6.25,6.375,6.5,6.625,6.75,6.875],[7.0,7.125,7.25,7.375,7.5,7.625,7.75,7.875],[8.0,8.125,8.25,8.375,8.5,8.625,8.75,8.875],[9.0,9.125,9.25,9.375,9.5,9.625,9.75,9.875],[10.0,10.125,10.25,10.375,10.5,10.625,10.75,10.875],[11.0,11.125,11.25,11.375,11.5,11.625,11.75,11.875],[12.0,12.125,12.25,12.375,12.5,12.625,12.75,12.875],[13.0,13.125,13.25,13.375,13.5,13.625,13.75,13.875],[14.0,14.125,14.25,14.375,14.5,14.625,14.75,14.875],[15.0,15.125,15.25,15.375,15.5,15.625,15.75,15.875],[16.0,16.125,16.25,16.375,16.5,16.625,16.75,16.875]]" |\
diff --git a/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir
index 9d76c73..cea3833 100644
--- a/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir
@@ -1,73 +1,75 @@
 // RUN: iree-opt %s
 
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(
+      %variant_op: !transform.any_op {transform.consumed}) {
+    // Step 1. Find the fill and matmul ops
+    // ===========================================================================
+    %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
 
-  // Step 1. Find the fill and matmul ops
-  // ===========================================================================
-  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    // Step 2. Tile the matmul and fuse the fill
+    // ===========================================================================
+    %grid_reduction, %forall_grid =
+    transform.structured.tile_using_forall %matmul tile_sizes [16] ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
 
-  // Step 2. Tile the matmul and fuse the fill
-  // ===========================================================================
-  %grid_reduction, %forall_grid =
-  transform.structured.tile_using_forall %matmul tile_sizes [16] ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
+    transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
 
-  transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-  // Promote operands in order to test loading from shared memory.
-  %matmul_2 = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %promoted_matmul, %alloc_0, %alloc_1 =
-    transform.iree.promote_operands %matmul_2 [0, 1] 
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+    // Promote operands in order to test loading from shared memory.
+    %matmul_2 = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %promoted_matmul, %alloc_0, %alloc_1 =
+      transform.iree.promote_operands %matmul_2 [0, 1] 
+        : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
 
 
-  // Step 3. Vectorize
-  // ===========================================================================
-  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
-  %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
+    // Step 3. Vectorize
+    // ===========================================================================
+    %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+    } : !transform.any_op
+    %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
 
-  // Step 4. Bufferize
-  // ===========================================================================
-  transform.apply_patterns to %func_3 {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-  } : !transform.any_op
-  transform.apply_patterns to %func_3 {
-    transform.apply_patterns.tensor.reassociative_reshape_folding
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_cse %func_3 : !transform.any_op
-  transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-  transform.apply_patterns to %func_3 {
-    transform.apply_patterns.linalg.erase_unnecessary_inputs
-  } : !transform.any_op
-  %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    // Step 4. Bufferize
+    // ===========================================================================
+    transform.apply_patterns to %func_3 {
+      transform.apply_patterns.iree.fold_fill_into_pad
+      transform.apply_patterns.linalg.tiling_canonicalization
+      transform.apply_patterns.scf.for_loop_canonicalization
+    } : !transform.any_op
+    transform.apply_patterns to %func_3 {
+      transform.apply_patterns.tensor.reassociative_reshape_folding
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.iree.apply_cse %func_3 : !transform.any_op
+    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+    transform.apply_patterns to %func_3 {
+      transform.apply_patterns.linalg.erase_unnecessary_inputs
+    } : !transform.any_op
+    %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
+    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
 
-  // Step 5. Pre-process the contract and transfer ops to put it in the right form.
-  // ===========================================================================
-  %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func_2 {
-    transform.apply_patterns.iree.prepare_vector_to_mma
-  } : !transform.any_op
+    // Step 5. Pre-process the contract and transfer ops to put it in the right form.
+    // ===========================================================================
+    %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func_2 {
+      transform.apply_patterns.iree.prepare_vector_to_mma
+    } : !transform.any_op
 
-  // Step 6. Post-bufferization vector distribution
-  // ===========================================================================
-  %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
-  transform.iree.map_nested_forall_to_gpu_threads %func_7
-      workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
+    // Step 6. Post-bufferization vector distribution
+    // ===========================================================================
+    %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
+    transform.iree.map_nested_forall_to_gpu_threads %func_7
+        workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
 
-  // Step 7. Do layout analysis and lower to mma
-  // ===========================================================================
-  %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
-}
+    // Step 7. Do layout analysis and lower to mma
+    // ===========================================================================
+    %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
+    transform.yield
+  }
+} // module
diff --git a/tests/transform_dialect/cuda/reduction.mlir b/tests/transform_dialect/cuda/reduction.mlir
index e814353..500eba6 100644
--- a/tests/transform_dialect/cuda/reduction.mlir
+++ b/tests/transform_dialect/cuda/reduction.mlir
@@ -25,7 +25,8 @@
 // RUN:     --iree-hal-configuration-pipeline | \
 // RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/reduction_codegen_spec.mlir | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_codegen_spec.mlir \
+// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
 // RUN: FileCheck %s --check-prefix=CHECK
 
 // RUN: iree-compile %s --iree-hal-target-backends=cuda \
@@ -33,7 +34,8 @@
 /// Constant JIT'ing must be disabled because the transform-dialect debug
 /// flags leak to the JIT session, which doesn't know what to do with them.
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/reduction_codegen_spec.mlir | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_codegen_spec.mlir \
+// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
 // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\
 // RUN: FileCheck %s --check-prefix=EXEC
 
diff --git a/tests/transform_dialect/cuda/reduction_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_codegen_spec.mlir
index 4fc6a49..0ab6d6a 100644
--- a/tests/transform_dialect/cuda/reduction_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/reduction_codegen_spec.mlir
@@ -1,115 +1,120 @@
 // RUN: iree-opt %s
 
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
-  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @codegen(
+      %variant_op: !transform.any_op {transform.consumed}) {
 
-  // Step 1. Split the reduction to get meatier (size(red) / 2)-way parallelism.
-  // ===========================================================================
-  %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %init_or_alloc_op, %more_parallel_fill_op, %more_parallel_op, %combiner_op =
-    transform.structured.split_reduction %0
-      { split_factor = 2, insert_split_dimension = 1 }
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
 
-  // Step 2. First level of tiling + fusion parallelizes to blocks.
-  // ===========================================================================
-  %grid_combiner_op, %forall_grid =
-    transform.structured.tile_using_forall %combiner_op tile_sizes [1]
-      ( mapping = [#gpu.block<x>] )
-       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
-  %not_combiner = transform.merge_handles %fill, %more_parallel_fill_op, %more_parallel_op : !transform.any_op
-  transform.structured.fuse_into_containing_op %not_combiner into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    // Step 1. Split the reduction to get meatier (size(red) / 2)-way parallelism.
+    // ===========================================================================
+    %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %init_or_alloc_op, %more_parallel_fill_op, %more_parallel_op, %combiner_op =
+      transform.structured.split_reduction %0
+        { split_factor = 2, insert_split_dimension = 1 }
+        : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
 
-  // Step 3. Second level of tiling + fusion parallelizes to threads.
-  // ===========================================================================
-  %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op 
-    : (!transform.any_op) -> !transform.any_op
-  %block_combiner_op, %forall_block_combiner_op =
-    transform.structured.tile_using_forall %grid_combiner_op tile_sizes [1] 
-    ( mapping = [#gpu.thread<z>] )
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.structured.fuse_into_containing_op %fill_1d into %forall_block_combiner_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    // Step 2. First level of tiling + fusion parallelizes to blocks.
+    // ===========================================================================
+    %grid_combiner_op, %forall_grid =
+      transform.structured.tile_using_forall %combiner_op tile_sizes [1]
+        ( mapping = [#gpu.block<x>] )
+        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
+    %not_combiner = transform.merge_handles %fill, %more_parallel_fill_op, %more_parallel_op : !transform.any_op
+    transform.structured.fuse_into_containing_op %not_combiner into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
 
-  // Canonicalizations.
-  %func_op = transform.structured.match ops{["func.func"]} in %variant_op 
+    // Step 3. Second level of tiling + fusion parallelizes to threads.
+    // ===========================================================================
+    %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op 
       : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func_op {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-  } : !transform.any_op
-  transform.iree.apply_licm %func_op : !transform.any_op
-  transform.iree.apply_cse %func_op : !transform.any_op
+    %block_combiner_op, %forall_block_combiner_op =
+      transform.structured.tile_using_forall %grid_combiner_op tile_sizes [1] 
+      ( mapping = [#gpu.thread<z>] )
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.structured.fuse_into_containing_op %fill_1d into %forall_block_combiner_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
 
-  %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op 
-    : (!transform.any_op) -> !transform.any_op
-  %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]}
-    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op 
+    // Canonicalizations.
+    %func_op = transform.structured.match ops{["func.func"]} in %variant_op 
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func_op {
+      transform.apply_patterns.iree.fold_fill_into_pad
+      transform.apply_patterns.linalg.tiling_canonicalization
+      transform.apply_patterns.scf.for_loop_canonicalization
+    } : !transform.any_op
+    transform.iree.apply_licm %func_op : !transform.any_op
+    transform.iree.apply_cse %func_op : !transform.any_op
+
+    %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op 
       : (!transform.any_op) -> !transform.any_op
-  %block_more_parallel_op, %forall_block_more_parallel_op =
-    transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1] 
-    ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]}
+      attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op 
+        : (!transform.any_op) -> !transform.any_op
+    %block_more_parallel_op, %forall_block_more_parallel_op =
+      transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1] 
+      ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
 
-  // Step 4. Rank-reduce and vectorize.
-  // ===========================================================================
-  %func = transform.structured.match ops{["func.func"]} in %variant_op 
-    : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
-  %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
+    // Step 4. Rank-reduce and vectorize.
+    // ===========================================================================
+    %func = transform.structured.match ops{["func.func"]} in %variant_op 
+      : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+    } : !transform.any_op
+    %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
 
-  // Step 5. Bufferize and drop HAL decriptor from memref ops.
-  // ===========================================================================
-  transform.apply_patterns to %func_3 {
-    transform.apply_patterns.tensor.reassociative_reshape_folding
-  } : !transform.any_op
-  transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-  %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 
-    : (!transform.any_op) -> !transform.any_op
+    // Step 5. Bufferize and drop HAL descriptor from memref ops.
+    // ===========================================================================
+    transform.apply_patterns to %func_3 {
+      transform.apply_patterns.tensor.reassociative_reshape_folding
+    } : !transform.any_op
+    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+    %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
+    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 
+      : (!transform.any_op) -> !transform.any_op
 
-  // Step 6. Post-bufferization mapping to blocks and threads.
-  // ===========================================================================
-  %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_3 
-    : (!transform.any_op) -> !transform.any_op
-  transform.iree.forall_to_workgroup %func_5 : (!transform.any_op) -> ()
-  transform.iree.map_nested_forall_to_gpu_threads %func_5
-      workgroup_dims = [32, 2, 1] : (!transform.any_op) -> ()
+    // Step 6. Post-bufferization mapping to blocks and threads.
+    // ===========================================================================
+    %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_3 
+      : (!transform.any_op) -> !transform.any_op
+    transform.iree.forall_to_workgroup %func_5 : (!transform.any_op) -> ()
+    transform.iree.map_nested_forall_to_gpu_threads %func_5
+        workgroup_dims = [32, 2, 1] : (!transform.any_op) -> ()
 
-  // Step 7. Post-bufferization vector distribution with rank-reduction.
-  // ===========================================================================
-  transform.apply_patterns to %func_5 {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.memref.fold_memref_alias_ops
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
-  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 
-    : (!transform.any_op) -> !transform.any_op
-  // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0)
-  // at this point.
-  transform.sequence %variant_op_3 : !transform.any_op failures(suppress) {
-  ^bb0(%arg0: !transform.any_op):
-    transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op
+    // Step 7. Post-bufferization vector distribution with rank-reduction.
+    // ===========================================================================
+    transform.apply_patterns to %func_5 {
+      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+      transform.apply_patterns.memref.fold_memref_alias_ops
+      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+    } : !transform.any_op
+    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 
+      : (!transform.any_op) -> !transform.any_op
+    // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0)
+    // at this point.
+    transform.sequence %variant_op_3 : !transform.any_op failures(suppress) {
+    ^bb0(%arg0: !transform.any_op):
+      transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op
+    }
+    transform.iree.vector.warp_distribute %func_5 : (!transform.any_op) -> ()
+
+
+    // Late Canonicalizations.
+    %func_op_3 = transform.structured.match ops{["func.func"]} in %variant_op_3
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func_op_3 {
+      transform.apply_patterns.iree.fold_fill_into_pad
+      transform.apply_patterns.linalg.tiling_canonicalization
+      transform.apply_patterns.scf.for_loop_canonicalization
+    } : !transform.any_op
+    transform.iree.apply_licm %func_op_3 : !transform.any_op
+    transform.iree.apply_cse %func_op_3 : !transform.any_op
+
+    transform.yield 
   }
-  transform.iree.vector.warp_distribute %func_5 : (!transform.any_op) -> ()
-
-
-  // Late Canonicalizations.
-  %func_op_3 = transform.structured.match ops{["func.func"]} in %variant_op_3
-      : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func_op_3 {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-  } : !transform.any_op
-  transform.iree.apply_licm %func_op_3 : !transform.any_op
-  transform.iree.apply_cse %func_op_3 : !transform.any_op
-}
+} // module
diff --git a/tests/transform_dialect/cuda/reduction_eltwise.mlir b/tests/transform_dialect/cuda/reduction_eltwise.mlir
index 5e75858..a266998 100644
--- a/tests/transform_dialect/cuda/reduction_eltwise.mlir
+++ b/tests/transform_dialect/cuda/reduction_eltwise.mlir
@@ -36,12 +36,14 @@
 // RUN:     --iree-hal-configuration-pipeline | \
 // RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/reduction_eltwise_codegen_spec.mlir | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_eltwise_codegen_spec.mlir \
+// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
 // RUN: FileCheck %s --check-prefix=CHECK
 
 // RUN: iree-compile %s --iree-hal-target-backends=cuda \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/reduction_eltwise_codegen_spec.mlir | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_eltwise_codegen_spec.mlir \
+// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
 // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\
 // RUN: FileCheck %s --check-prefix=EXEC
 
diff --git a/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir
index 4a95480..fb4bcd6 100644
--- a/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir
@@ -1,155 +1,160 @@
 // RUN: iree-opt %s
 
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
-  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op
-    : (!transform.any_op) -> !transform.any_op
-
-  // Step 1. Split the reduction to get meatier (size(red) / 2)-way parallelism.
-  // ===========================================================================
-  %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op
-    : (!transform.any_op) -> !transform.any_op
-  %reduction, %eltwise = transform.split_handle %0
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  %init_or_alloc_op, %more_parallel_fill_op, %more_parallel_op, %combiner_op =
-    transform.structured.split_reduction %reduction
-      { split_factor = 2, insert_split_dimension = 1 }
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-
-  // Canonicalizations.
-  %func_op = transform.structured.match ops{["func.func"]} in %variant_op
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @codegen(
+      %variant_op: !transform.any_op {transform.consumed}) {
+    %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op
       : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func_op {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_licm %func_op : !transform.any_op
-  transform.iree.apply_cse %func_op : !transform.any_op
 
-  // Step 2. First level of tiling + fusion parallelizes to blocks. Tile the
-  // trailing elementwise the same way we want to tile the reduction.
-  // ===========================================================================
-  %eltwise_grid_op, %grid_loop = transform.structured.tile_using_forall %eltwise
-    tile_sizes [1] (mapping = [#gpu.block<x>])
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.iree.populate_workgroup_count_region_using_num_threads_slice %grid_loop : (!transform.any_op) -> ()
-  %not_eltwise = transform.merge_handles %fill, %more_parallel_fill_op, %more_parallel_op, %combiner_op
-    : !transform.any_op
-  transform.structured.fuse_into_containing_op %not_eltwise into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-  // Canonicalizations.
-  transform.apply_patterns to %func_op {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_licm %func_op : !transform.any_op
-  transform.iree.apply_cse %func_op : !transform.any_op
-
-  // Step 3. Second level of tiling + fusion parallelizes to threads.
-  // ===========================================================================
-  %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op
-    : (!transform.any_op) -> !transform.any_op
-  %eltwise_block_op, %eltwise_block_loop =
-    transform.structured.tile_using_forall %eltwise_grid_op tile_sizes [1]
-    ( mapping = [#gpu.thread<z>] )
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  %block_combiner_op = transform.structured.match ops{["linalg.generic"]}
-    attributes {iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op
+    // Step 1. Split the reduction to get meatier (size(red) / 2)-way parallelism.
+    // ===========================================================================
+    %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op
       : (!transform.any_op) -> !transform.any_op
-  %combined_and_fill = transform.merge_handles %fill_1d, %block_combiner_op : !transform.any_op
-  transform.structured.fuse_into_containing_op %combined_and_fill into %eltwise_block_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %reduction, %eltwise = transform.split_handle %0
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %init_or_alloc_op, %more_parallel_fill_op, %more_parallel_op, %combiner_op =
+      transform.structured.split_reduction %reduction
+        { split_factor = 2, insert_split_dimension = 1 }
+        : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
 
-  // Canonicalizations.
-  transform.apply_patterns to %func_op {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_licm %func_op : !transform.any_op
-  transform.iree.apply_cse %func_op : !transform.any_op
+    // Canonicalizations.
+    %func_op = transform.structured.match ops{["func.func"]} in %variant_op
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func_op {
+      transform.apply_patterns.iree.fold_fill_into_pad
+      transform.apply_patterns.linalg.tiling_canonicalization
+      transform.apply_patterns.scf.for_loop_canonicalization
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.iree.apply_licm %func_op : !transform.any_op
+    transform.iree.apply_cse %func_op : !transform.any_op
 
-  %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op
-    : (!transform.any_op) -> !transform.any_op
-  %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]}
-    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op
+    // Step 2. First level of tiling + fusion parallelizes to blocks. Tile the
+    // trailing elementwise the same way we want to tile the reduction.
+    // ===========================================================================
+    %eltwise_grid_op, %grid_loop = transform.structured.tile_using_forall %eltwise
+      tile_sizes [1] (mapping = [#gpu.block<x>])
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.iree.populate_workgroup_count_region_using_num_threads_slice %grid_loop : (!transform.any_op) -> ()
+    %not_eltwise = transform.merge_handles %fill, %more_parallel_fill_op, %more_parallel_op, %combiner_op
+      : !transform.any_op
+    transform.structured.fuse_into_containing_op %not_eltwise into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    // Canonicalizations.
+    transform.apply_patterns to %func_op {
+      transform.apply_patterns.iree.fold_fill_into_pad
+      transform.apply_patterns.linalg.tiling_canonicalization
+      transform.apply_patterns.scf.for_loop_canonicalization
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.iree.apply_licm %func_op : !transform.any_op
+    transform.iree.apply_cse %func_op : !transform.any_op
+
+    // Step 3. Second level of tiling + fusion parallelizes to threads.
+    // ===========================================================================
+    %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op
       : (!transform.any_op) -> !transform.any_op
-  %block_more_parallel_op, %forall_block_more_parallel_op =
-    transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1]
-    ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %eltwise_block_op, %eltwise_block_loop =
+      transform.structured.tile_using_forall %eltwise_grid_op tile_sizes [1]
+      ( mapping = [#gpu.thread<z>] )
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %block_combiner_op = transform.structured.match ops{["linalg.generic"]}
+      attributes {iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op
+        : (!transform.any_op) -> !transform.any_op
+    %combined_and_fill = transform.merge_handles %fill_1d, %block_combiner_op : !transform.any_op
+    transform.structured.fuse_into_containing_op %combined_and_fill into %eltwise_block_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
 
-  // Canonicalizations.
-  transform.apply_patterns to %func_op {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_licm %func_op : !transform.any_op
-  transform.iree.apply_cse %func_op : !transform.any_op
+    // Canonicalizations.
+    transform.apply_patterns to %func_op {
+      transform.apply_patterns.iree.fold_fill_into_pad
+      transform.apply_patterns.linalg.tiling_canonicalization
+      transform.apply_patterns.scf.for_loop_canonicalization
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.iree.apply_licm %func_op : !transform.any_op
+    transform.iree.apply_cse %func_op : !transform.any_op
 
-  // Step 4. Rank-reduce and vectorize.
-  // ===========================================================================
-  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
-  %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
+    %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op
+      : (!transform.any_op) -> !transform.any_op
+    %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]}
+      attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op
+        : (!transform.any_op) -> !transform.any_op
+    %block_more_parallel_op, %forall_block_more_parallel_op =
+      transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1]
+      ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
 
-  // Step 5. Bufferize and drop HAL decriptor from memref ops.
-  // ===========================================================================
-  transform.apply_patterns to %func_3 {
-    transform.apply_patterns.tensor.reassociative_reshape_folding
-  } : !transform.any_op
-  transform.iree.eliminate_empty_tensors %variant_op: (!transform.any_op) -> ()
-  %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
-    : (!transform.any_op) -> !transform.any_op
+    // Canonicalizations.
+    transform.apply_patterns to %func_op {
+      transform.apply_patterns.iree.fold_fill_into_pad
+      transform.apply_patterns.linalg.tiling_canonicalization
+      transform.apply_patterns.scf.for_loop_canonicalization
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.iree.apply_licm %func_op : !transform.any_op
+    transform.iree.apply_cse %func_op : !transform.any_op
 
-  // Step 6. Post-bufferization mapping to blocks and threads.
-  // ===========================================================================
-  %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  transform.iree.forall_to_workgroup %func_5 : (!transform.any_op) -> ()
-  transform.iree.map_nested_forall_to_gpu_threads %func_5
-      workgroup_dims = [32, 2, 1] : (!transform.any_op) -> ()
+    // Step 4. Rank-reduce and vectorize.
+    // ===========================================================================
+    %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+    } : !transform.any_op
+    %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
 
-  // Step 7. Post-bufferization vector distribution with rank-reduction.
-  // ===========================================================================
-  transform.apply_patterns to %func_5 {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.memref.fold_memref_alias_ops
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
-  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
-    : (!transform.any_op) -> !transform.any_op
-  // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0)
-  // at this point.
-  transform.sequence %variant_op_3 : !transform.any_op failures(suppress) {
-  ^bb0(%arg0: !transform.any_op):
-    transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
-    : (!transform.any_op) -> !transform.any_op
+    // Step 5. Bufferize and drop HAL descriptor from memref ops.
+    // ===========================================================================
+    transform.apply_patterns to %func_3 {
+      transform.apply_patterns.tensor.reassociative_reshape_folding
+    } : !transform.any_op
+    transform.iree.eliminate_empty_tensors %variant_op: (!transform.any_op) -> ()
+    %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
+    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
+      : (!transform.any_op) -> !transform.any_op
+
+    // Step 6. Post-bufferization mapping to blocks and threads.
+    // ===========================================================================
+    %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    transform.iree.forall_to_workgroup %func_5 : (!transform.any_op) -> ()
+    transform.iree.map_nested_forall_to_gpu_threads %func_5
+        workgroup_dims = [32, 2, 1] : (!transform.any_op) -> ()
+
+    // Step 7. Post-bufferization vector distribution with rank-reduction.
+    // ===========================================================================
+    transform.apply_patterns to %func_5 {
+      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+      transform.apply_patterns.memref.fold_memref_alias_ops
+      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+    } : !transform.any_op
+    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
+      : (!transform.any_op) -> !transform.any_op
+    // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0)
+    // at this point.
+    transform.sequence %variant_op_3 : !transform.any_op failures(suppress) {
+    ^bb0(%arg0: !transform.any_op):
+      transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
+      : (!transform.any_op) -> !transform.any_op
+    }
+    transform.iree.vector.warp_distribute %func_5 : (!transform.any_op) -> ()
+
+
+    // Late canonicalizations.
+    %func_op_3 = transform.structured.match ops{["func.func"]} in %variant_op_3
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func_op_3 {
+      transform.apply_patterns.iree.fold_fill_into_pad
+      transform.apply_patterns.linalg.tiling_canonicalization
+      transform.apply_patterns.scf.for_loop_canonicalization
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.iree.apply_licm %func_op_3 : !transform.any_op
+    transform.iree.apply_cse %func_op_3 : !transform.any_op
+
+    transform.yield 
   }
-  transform.iree.vector.warp_distribute %func_5 : (!transform.any_op) -> ()
+} // module
 
-
-  // Late canonicalizations.
-  %func_op_3 = transform.structured.match ops{["func.func"]} in %variant_op_3
-      : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func_op_3 {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_licm %func_op_3 : !transform.any_op
-  transform.iree.apply_cse %func_op_3 : !transform.any_op
-}
diff --git a/tests/transform_dialect/cuda/reduction_v2.mlir b/tests/transform_dialect/cuda/reduction_v2.mlir
index de96cc1..6ff6442 100644
--- a/tests/transform_dialect/cuda/reduction_v2.mlir
+++ b/tests/transform_dialect/cuda/reduction_v2.mlir
@@ -25,12 +25,14 @@
 // RUN:     --iree-hal-configuration-pipeline | \
 // RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/reduction_v2_codegen_spec.mlir | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir \
+// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
 // RUN: FileCheck %s --check-prefix=CHECK
 
 // RUN: iree-compile %s --iree-hal-target-backends=cuda \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/reduction_v2_codegen_spec.mlir | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir \
+// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
 // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="33x1024xf32=1" |\
 // RUN: FileCheck %s --check-prefix=EXEC
 
diff --git a/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir
index f7a186d..b1479a0 100644
--- a/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir
@@ -1,103 +1,108 @@
 // RUN: iree-opt %s
 
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
-  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %reduction = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @codegen(
+      %variant_op: !transform.any_op {transform.consumed}) {
 
-  // Step 1. First level of tiling + fusion parallelizes to blocks.
-  // ===========================================================================
-  %grid_reduction, %forall_grid =
-    transform.structured.tile_using_forall %reduction tile_sizes [1]
-      ( mapping = [#gpu.block<x>] )
+    %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %reduction = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+
+    // Step 1. First level of tiling + fusion parallelizes to blocks.
+    // ===========================================================================
+    %grid_reduction, %forall_grid =
+      transform.structured.tile_using_forall %reduction tile_sizes [1]
+        ( mapping = [#gpu.block<x>] )
+        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
+    transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    // Step 2. Split the reduction to get meatier parallelism.
+    // ===========================================================================
+    %block_more_parallel_fill_op_2, %block_more_parallel_op_2, %block_combiner_op_2, %forall = 
+      transform.structured.tile_reduction_using_for %grid_reduction by tile_sizes = [0, 128]
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %_1:2 =
+      transform.structured.tile_using_forall %block_more_parallel_op_2 num_threads [0, 32]
+      ( mapping = [#gpu.thread<x>] )
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
-  transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
 
-  // Step 2. Split the reduction to get meatier parallelism.
-  // ===========================================================================
-  %block_more_parallel_fill_op_2, %block_more_parallel_op_2, %block_combiner_op_2, %forall = 
-    transform.structured.tile_reduction_using_for %grid_reduction by tile_sizes = [0, 128]
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-  %_1:2 =
-    transform.structured.tile_using_forall %block_more_parallel_op_2 num_threads [0, 32]
-    ( mapping = [#gpu.thread<x>] )
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    // Step 3. Second level of tiling parallelizes to threads.
+    // ===========================================================================
+    // 1st op is [parallel, parallel], map it to threadIdx.x by 4.
+    %_2:2 =
+      transform.structured.tile_using_forall %block_more_parallel_fill_op_2 tile_sizes [0, 4]
+      ( mapping = [#gpu.thread<x>] )
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    // 2nd op is [parallel, reduction] of 1x128, map the 1-dim to threadIdx.y to
+    // trigger mapping of the reduction to threadIdx.x via predication via `if (x==0)`.
+    %_3:2 =
+      transform.structured.tile_using_forall %block_combiner_op_2 tile_sizes [1] 
+      ( mapping = [#gpu.thread<y>] )
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
-  // Step 3. Second level of tiling parallelizes to threads.
-  // ===========================================================================
-  // 1st op is [parallel, parallel], map it to threadIdx.x by 4.
-  %_2:2 =
-    transform.structured.tile_using_forall %block_more_parallel_fill_op_2 tile_sizes [0, 4]
-    ( mapping = [#gpu.thread<x>] )
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  // 2nd op is [parallel, reduction] of 1x128, map the 1-dim to threadIdx.y to
-  // trigger mapping of the reduction to threadIdx.x via predication via `if (x==0)`.
-  %_3:2 =
-    transform.structured.tile_using_forall %block_combiner_op_2 tile_sizes [1] 
-    ( mapping = [#gpu.thread<y>] )
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    // Step 4. Rank-reduce and vectorize.
+    // ===========================================================================
+    %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+    } : !transform.any_op
+    %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
 
-  // Step 4. Rank-reduce and vectorize.
-  // ===========================================================================
-  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
-  %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
+    // Step 5. Bufferize and drop HAL descriptor from memref ops.
+    // ===========================================================================
+    // Canonicalization/CSE is needed before bufferization otherwise unnecessary
+    // allocs will be created.
+    transform.apply_patterns to %func_3 {
+      transform.apply_patterns.iree.fold_fill_into_pad
+      transform.apply_patterns.linalg.tiling_canonicalization
+      transform.apply_patterns.scf.for_loop_canonicalization
+    } : !transform.any_op
+    transform.apply_patterns to %func_3 {
+      transform.apply_patterns.tensor.reassociative_reshape_folding
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.iree.apply_cse %func_3 : !transform.any_op
+    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+    %func_5 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func_5 {
+      transform.apply_patterns.linalg.erase_unnecessary_inputs
+    } : !transform.any_op
+    %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
+    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
 
-  // Step 5. Bufferize and drop HAL decriptor from memref ops.
-  // ===========================================================================
-  // Canonicalization/CSE is needed before bufferization otherwise unnecessary
-  // allocs will be created.
-  transform.apply_patterns to %func_3 {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-  } : !transform.any_op
-  transform.apply_patterns to %func_3 {
-    transform.apply_patterns.tensor.reassociative_reshape_folding
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_cse %func_3 : !transform.any_op
-  transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-  %func_5 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func_5 {
-    transform.apply_patterns.linalg.erase_unnecessary_inputs
-  } : !transform.any_op
-  %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    // Step 6. Post-bufferization mapping to blocks and threads.
+    // ===========================================================================
+    %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
+    transform.iree.map_nested_forall_to_gpu_threads %func_7
+        workgroup_dims = [32, 1, 1] : (!transform.any_op) -> ()
 
-  // Step 6. Post-bufferization mapping to blocks and threads.
-  // ===========================================================================
-  %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
-  transform.iree.map_nested_forall_to_gpu_threads %func_7
-      workgroup_dims = [32, 1, 1] : (!transform.any_op) -> ()
+    // Step 7. Post-bufferization vector distribution with rank-reduction.
+    // ===========================================================================
+    transform.apply_patterns to %func_7 {
+      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+      transform.apply_patterns.memref.fold_memref_alias_ops
+      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+    } : !transform.any_op
+    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 
+      : (!transform.any_op) -> !transform.any_op
+    %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op
+    transform.iree.vector.warp_distribute %func_7
+      : (!transform.any_op) -> ()
 
-  // Step 7. Post-bufferization vector distribution with rank-reduction.
-  // ===========================================================================
-  transform.apply_patterns to %func_7 {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.memref.fold_memref_alias_ops
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
-  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 
-    : (!transform.any_op) -> !transform.any_op
-  %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op
-  transform.iree.vector.warp_distribute %func_7
-    : (!transform.any_op) -> ()
-
-  // Late canonicalizations to cleanup and pass the checks
-  transform.apply_patterns to %func_7 {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_licm %func_7 : !transform.any_op
-  transform.iree.apply_cse %func_7 : !transform.any_op
-}
+    // Late canonicalizations to cleanup and pass the checks
+    transform.apply_patterns to %func_7 {
+      transform.apply_patterns.iree.fold_fill_into_pad
+      transform.apply_patterns.linalg.tiling_canonicalization
+      transform.apply_patterns.scf.for_loop_canonicalization
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.iree.apply_licm %func_7 : !transform.any_op
+    transform.iree.apply_cse %func_7 : !transform.any_op
+  
+    transform.yield 
+  }
+} // module
diff --git a/tests/transform_dialect/cuda/reduction_v2_uneven.mlir b/tests/transform_dialect/cuda/reduction_v2_uneven.mlir
index ebc5d8a..29b2d48 100644
--- a/tests/transform_dialect/cuda/reduction_v2_uneven.mlir
+++ b/tests/transform_dialect/cuda/reduction_v2_uneven.mlir
@@ -25,12 +25,14 @@
 // RUN:     --iree-hal-configuration-pipeline | \
 // RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/reduction_v2_codegen_spec.mlir | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir \
+// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
 // RUN: FileCheck %s --check-prefix=CHECK
 
 // RUN: iree-compile %s --iree-hal-target-backends=cuda \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/reduction_v2_codegen_spec.mlir | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir \
+// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
 // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="33x34567xf32=1" |\
 // RUN: FileCheck %s --check-prefix=EXEC
 
diff --git a/tests/transform_dialect/cuda/softmax.mlir b/tests/transform_dialect/cuda/softmax.mlir
index 299b5cb..27464db 100644
--- a/tests/transform_dialect/cuda/softmax.mlir
+++ b/tests/transform_dialect/cuda/softmax.mlir
@@ -6,7 +6,8 @@
 // RUN:     --iree-stream-transformation-pipeline \
 // RUN:     --iree-hal-configuration-pipeline | \
 // RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/softmax_codegen_spec.mlir \
+// RUN:     --iree-codegen-transform-dialect-library=%p/softmax_codegen_spec.mlir \
+// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false | \
 // RUN: FileCheck %s --check-prefix=CHECK-SHUFFLE
 
@@ -16,7 +17,8 @@
 // RUN:     --iree-opt-const-expr-hoisting=false --iree-opt-const-eval=false \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
 // RUN:     --iree-flow-dispatch-use-transform-dialect=%p/softmax_dispatch_spec.mlir \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/softmax_codegen_spec.mlir | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/softmax_codegen_spec.mlir \
+// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
 // RUN: iree-run-module --module=- --function=softmax --device=cuda | \
 // RUN: FileCheck %s
 
diff --git a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_codegen_spec.mlir
index 4c71f83..345be1f 100644
--- a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/softmax_codegen_spec.mlir
@@ -1,109 +1,114 @@
 // RUN: iree-opt %s
 
 // Codegen
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
-  %ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
-    in %variant_op : (!transform.any_op) -> !transform.any_op
-  %input_max_fill,
-  %input_max,
-  %exps_sum_fill,
-  %exps,
-  %exps_sum,
-  %div = transform.split_handle %ops
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op,
-                           !transform.any_op, !transform.any_op, !transform.any_op)
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @codegen(
+      %variant_op: !transform.any_op {transform.consumed}) {
 
-  // Step 1. First level of tiling + fusion parallelizes to blocks.
-  // ==============================================================
-  %_, %forall =
-    transform.structured.tile_using_forall %div tile_sizes [1, 4]
-      ( mapping = [#gpu.block<x>, #gpu.block<y>] )
+    %ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
+      in %variant_op : (!transform.any_op) -> !transform.any_op
+    %input_max_fill,
+    %input_max,
+    %exps_sum_fill,
+    %exps,
+    %exps_sum,
+    %div = transform.split_handle %ops
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op,
+                            !transform.any_op, !transform.any_op, !transform.any_op)
+
+    // Step 1. First level of tiling + fusion parallelizes to blocks.
+    // ==============================================================
+    %_, %forall =
+      transform.structured.tile_using_forall %div tile_sizes [1, 4]
+        ( mapping = [#gpu.block<x>, #gpu.block<y>] )
+        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> ()
+
+    // TODO: Merging and fusing merged handles does not work properly atm.
+    transform.structured.fuse_into_containing_op %exps_sum into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.structured.fuse_into_containing_op %exps into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.structured.fuse_into_containing_op %exps_sum_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.structured.fuse_into_containing_op %input_max into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.structured.fuse_into_containing_op %input_max_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    // By default, fusion into scf.forall does not promote captured values
+    // to shared as this involves a cross-thread dependence analysis.
+    // Instead, we activate it explicitly post-hoc to promote all the extract_slice
+    // ops that we find and match the prerequisites
+    %forall_with_type = transform.cast %forall : !transform.any_op to !transform.op<"scf.forall">
+    transform.iree.share_forall_operands %forall_with_type
+      : (!transform.op<"scf.forall">) -> !transform.op<"scf.forall">
+    transform.apply_patterns to %variant_op {
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.iree.apply_cse %variant_op : !transform.any_op
+
+    // Step 2. Second level of tiling + fusion parallelizes to threads.
+    // ================================================================
+    %tiled_ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
+      in %variant_op : (!transform.any_op) -> !transform.any_op
+    %tiled_input_max_fill,
+    %tiled_input_max,
+    %tiled_exps_sum_fill,
+    %tiled_exp_and_exps_sum,
+    %tiled_exp_and_exps_sum_2,
+    %tiled_div = transform.split_handle %tiled_ops
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op,
+                            !transform.any_op, !transform.any_op, !transform.any_op)
+    // Leaving the reduction untiled on threadIdx.x makes it sequential on
+    // threadIdx.x. After distribution, predication by `if (threadIdx.x == 0)` is
+    // introduced and opportunities for distributing vector ops across warps
+    // appear.
+    %reduction_linalg_ops = transform.merge_handles %tiled_input_max,
+                                                    %tiled_exp_and_exps_sum,
+                                                    %tiled_exp_and_exps_sum_2
+      : !transform.any_op
+    transform.structured.tile_using_forall %reduction_linalg_ops tile_sizes [1, 1]
+      ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> ()
+    // Fully parallel ops are tiled and mapped.
+    %parallel_linalg_ops = transform.merge_handles %tiled_input_max_fill,
+                                                  %tiled_exps_sum_fill,
+                                                  %tiled_div
+      : !transform.any_op
+    transform.structured.tile_using_forall %parallel_linalg_ops num_threads [1, 4, 32]
+        ( mapping = [#gpu.thread<z>, #gpu.thread<y>, #gpu.thread<x>] )
+        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
-  // TODO: Merging and fusing merged handles does not work properly atm.
-  transform.structured.fuse_into_containing_op %exps_sum into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.structured.fuse_into_containing_op %exps into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.structured.fuse_into_containing_op %exps_sum_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.structured.fuse_into_containing_op %input_max into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.structured.fuse_into_containing_op %input_max_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  // By default, fusion into scf.forall does not promote captured values
-  // to shared as this involves a cross-thread dependence analysis.
-  // Instead, we activate it explicitly post-hoc to promote all the extract_slice
-  // ops that we find and match the prerequisites
-  %forall_with_type = transform.cast %forall : !transform.any_op to !transform.op<"scf.forall">
-  transform.iree.share_forall_operands %forall_with_type
-    : (!transform.op<"scf.forall">) -> !transform.op<"scf.forall">
-  transform.apply_patterns to %variant_op {
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_cse %variant_op : !transform.any_op
+    // Step 3. Rank-reduce and vectorize.
+    // ==================================
+    %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+    } : !transform.any_op
+    transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
 
-  // Step 2. Second level of tiling + fusion parallelizes to threads.
-  // ================================================================
-  %tiled_ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
-    in %variant_op : (!transform.any_op) -> !transform.any_op
-  %tiled_input_max_fill,
-  %tiled_input_max,
-  %tiled_exps_sum_fill,
-  %tiled_exp_and_exps_sum,
-  %tiled_exp_and_exps_sum_2,
-  %tiled_div = transform.split_handle %tiled_ops
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op,
-                           !transform.any_op, !transform.any_op, !transform.any_op)
-  // Leaving the reduction untiled on threadIdx.x makes it sequential on
-  // threadIdx.x. After distribution, predication by `if (threadIdx.x == 0)` is
-  // introduced and opportunities for distributing vector ops across warps
-  // appear.
-  %reduction_linalg_ops = transform.merge_handles %tiled_input_max,
-                                                  %tiled_exp_and_exps_sum,
-                                                  %tiled_exp_and_exps_sum_2
-    : !transform.any_op
-  transform.structured.tile_using_forall %reduction_linalg_ops tile_sizes [1, 1]
-    ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  // Fully parallel ops are tiled and mapped.
-  %parallel_linalg_ops = transform.merge_handles %tiled_input_max_fill,
-                                                 %tiled_exps_sum_fill,
-                                                 %tiled_div
-    : !transform.any_op
-  transform.structured.tile_using_forall %parallel_linalg_ops num_threads [1, 4, 32]
-      ( mapping = [#gpu.thread<z>, #gpu.thread<y>, #gpu.thread<x>] )
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    // Step 4. Bufferize and drop HAL descriptor from memref ops.
+    // =========================================================
+    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+    %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
+    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
 
-  // Step 3. Rank-reduce and vectorize.
-  // ==================================
-  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
-  transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
+    // Step 5. Post-bufferization mapping to blocks and threads.
+    // =========================================================
+    %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    transform.iree.forall_to_workgroup %func_2 : (!transform.any_op) -> ()
+    transform.iree.map_nested_forall_to_gpu_threads %func_2 workgroup_dims = [32, 4, 1] : (!transform.any_op) -> ()
 
-  // Step 4. Bufferize and drop HAL decriptor from memref ops.
-  // =========================================================
-  transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-  %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    // Step 6. Post-bufferization vector distribution with rank-reduction.
+    // ===================================================================
+    %end_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %end_func {
+      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+      transform.apply_patterns.memref.fold_memref_alias_ops
+      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+    } : !transform.any_op
+    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op
+    transform.iree.vector.warp_distribute %end_func : (!transform.any_op) -> ()
 
-  // Step 5. Post-bufferization mapping to blocks and threads.
-  // =========================================================
-  %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  transform.iree.forall_to_workgroup %func_2 : (!transform.any_op) -> ()
-  transform.iree.map_nested_forall_to_gpu_threads %func_2 workgroup_dims = [32, 4, 1] : (!transform.any_op) -> ()
-
-  // Step 6. Post-bufferization vector distribution with rank-reduction.
-  // ===================================================================
-  %end_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %end_func {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.memref.fold_memref_alias_ops
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
-  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op
-  transform.iree.vector.warp_distribute %end_func : (!transform.any_op) -> ()
-}
+    transform.yield 
+  }
+} // module
diff --git a/tests/transform_dialect/cuda/softmax_partial.mlir b/tests/transform_dialect/cuda/softmax_partial.mlir
index 6f4ca42..91032cb 100644
--- a/tests/transform_dialect/cuda/softmax_partial.mlir
+++ b/tests/transform_dialect/cuda/softmax_partial.mlir
@@ -5,7 +5,8 @@
 // RUN:     --iree-stream-transformation-pipeline \
 // RUN:     --iree-hal-configuration-pipeline | \
 // RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/softmax_partial_codegen_spec.mlir \
+// RUN:     --iree-codegen-transform-dialect-library=%p/softmax_partial_codegen_spec.mlir \
+// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false | \
 // RUN: FileCheck %s --check-prefix=CHECK-SHUFFLE
 
@@ -14,7 +15,8 @@
 /// Constant JIT'ing must be disabled because the transform-dialect debug
 /// flags leak to the JIT session, which doesn't know what to do with them.
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/softmax_partial_codegen_spec.mlir | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/softmax_partial_codegen_spec.mlir \
+// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
 // RUN: iree-run-module --module=- --function=softmax_partial --device=cuda | \
 // RUN: FileCheck %s
 
diff --git a/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir
index 5f8175c..7c1564f 100644
--- a/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir
@@ -1,93 +1,97 @@
 // RUN: iree-opt %s
 
 // Codegen
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @codegen(
+      %variant_op: !transform.any_op {transform.consumed}) {
 
-  // Step 1. First level of tiling + fusion parallelizes to blocks.
-  // ==============================================================
-  %root = transform.structured.match interface{LinalgOp}
-    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %red = transform.structured.match interface{LinalgOp}
-    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %not_root = merge_handles %fill, %red : !transform.any_op
-  %tiled_generic, %forall =
-    transform.structured.tile_using_forall %root tile_sizes [1, 4]
-    ( mapping = [#gpu.block<x>, #gpu.block<y>] )
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> ()
-  transform.structured.fuse_into_containing_op %not_root into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-  // Step 2. Second level of tiling + fusion parallelizes to threads.
-  // ================================================================
-  %fill_linalg = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %reduction_linalg = transform.structured.match ops{["linalg.generic"]}
-    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %parallel_linalg = transform.structured.match ops{["linalg.generic"]}
-    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %tiled_reduction_generic, %forall_reduction =
-    transform.structured.tile_using_forall %reduction_linalg tile_sizes [1, 1]
-      ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
+    // Step 1. First level of tiling + fusion parallelizes to blocks.
+    // ==============================================================
+    %root = transform.structured.match interface{LinalgOp}
+      attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %red = transform.structured.match interface{LinalgOp}
+      attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %not_root = transform.merge_handles %fill, %red : !transform.any_op
+    %tiled_generic, %forall =
+      transform.structured.tile_using_forall %root tile_sizes [1, 4]
+      ( mapping = [#gpu.block<x>, #gpu.block<y>] )
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  // TODO: this fusion currently does not happen properly, this is related to the clone
-  // behavior when fusing into scf.forall.
-  // Once fixed we'll be able to fuse.
-  // Fusion will save us one roundtrip to memory.
-  // transform.structured.fuse_into_containing_op %fill_linalg into %forall_reduction
-  transform.structured.tile_using_forall %parallel_linalg num_threads [1, 4, 32]
-      ( mapping = [#gpu.thread<z>, #gpu.thread<y>, #gpu.thread<x>] )
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> ()
+    transform.structured.fuse_into_containing_op %not_root into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    // Step 2. Second level of tiling + fusion parallelizes to threads.
+    // ================================================================
+    %fill_linalg = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %reduction_linalg = transform.structured.match ops{["linalg.generic"]}
+      attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %parallel_linalg = transform.structured.match ops{["linalg.generic"]}
+      attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %tiled_reduction_generic, %forall_reduction =
+      transform.structured.tile_using_forall %reduction_linalg tile_sizes [1, 1]
+        ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
+        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    // TODO: this fusion currently does not happen properly, this is related to the clone
+    // behavior when fusing into scf.forall.
+    // Once fixed we'll be able to fuse.
+    // Fusion will save us one roundtrip to memory.
+    // transform.structured.fuse_into_containing_op %fill_linalg into %forall_reduction
+    transform.structured.tile_using_forall %parallel_linalg num_threads [1, 4, 32]
+        ( mapping = [#gpu.thread<z>, #gpu.thread<y>, #gpu.thread<x>] )
+        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
 
-  // Inability to tile reductions to scf.forall has 2 implications:
-  //   1. since no scf.forall is present, no gpu.barrier is added.
-  //      This should be fixed independently: ops that are not nested in an scf.forall
-  //      should have a gpu.barrier. Later needs to be complemented by a barrier
-  //      removal pass.
-  //   2. Similarly, needs to be predicated under an if threadIx == 0 to avoid
-  //      multiple threads updating the buffer inplace once bufferized.
-  //
-  // Instead, we can vectorize and go to vector SSA values that sidestep these
-  // issues.
-  // Everyone will race to the write while still computing the same value.
-  //
-  // That is still not good enough because we need to predicate this in order
-  // to enable the parallel reduction on warps.
+    // Inability to tile reductions to scf.forall has 2 implications:
+    //   1. since no scf.forall is present, no gpu.barrier is added.
+    //      This should be fixed independently: ops that are not nested in an scf.forall
+    //      should have a gpu.barrier. Later needs to be complemented by a barrier
+    //      removal pass.
+    //      2. Similarly, needs to be predicated under an if threadIdx == 0 to avoid
+    //      multiple threads updating the buffer inplace once bufferized.
+    //
+    // Instead, we can vectorize and go to vector SSA values that sidestep these
+    // issues.
+    // Everyone will race to the write while still computing the same value.
+    //
+    // That is still not good enough because we need to predicate this in order
+    // to enable the parallel reduction on warps.
 
-  // Step 3. Rank-reduce and vectorize.
-  // ==================================
-  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
-  transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
+    // Step 3. Rank-reduce and vectorize.
+    // ==================================
+    %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+    } : !transform.any_op
+    transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
 
-  // Step 4. Bufferize and drop HAL decriptor from memref ops.
-  // =========================================================
-  transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-  %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    // Step 4. Bufferize and drop HAL descriptor from memref ops.
+    // =========================================================
+    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+    %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
+    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
 
-  // Step 5. Post-bufferization mapping to blocks and threads.
-  // =========================================================
-  %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  transform.iree.forall_to_workgroup %func_2 : (!transform.any_op) -> ()
-  transform.iree.map_nested_forall_to_gpu_threads %func_2 workgroup_dims = [32, 4, 1] : (!transform.any_op) -> ()
+    // Step 5. Post-bufferization mapping to blocks and threads.
+    // =========================================================
+    %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    transform.iree.forall_to_workgroup %func_2 : (!transform.any_op) -> ()
+    transform.iree.map_nested_forall_to_gpu_threads %func_2 workgroup_dims = [32, 4, 1] : (!transform.any_op) -> ()
 
-  // Step 6. Post-bufferization vector distribution with rank-reduction.
-  // ===================================================================
-  %end_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %end_func {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.memref.fold_memref_alias_ops
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
-  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-  %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
-    : (!transform.any_op) -> !transform.any_op
-  transform.iree.vector.warp_distribute %end_func : (!transform.any_op) -> ()
-}
+    // Step 6. Post-bufferization vector distribution with rank-reduction.
+    // ===================================================================
+    %end_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %end_func {
+      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+      transform.apply_patterns.memref.fold_memref_alias_ops
+      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+    } : !transform.any_op
+    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
+      : (!transform.any_op) -> !transform.any_op
+    transform.iree.vector.warp_distribute %end_func : (!transform.any_op) -> ()
+
+    transform.yield
+  }
+} // module
diff --git a/tests/transform_dialect/cuda/softmax_v2.mlir b/tests/transform_dialect/cuda/softmax_v2.mlir
index 2a556d1..07e3c28 100644
--- a/tests/transform_dialect/cuda/softmax_v2.mlir
+++ b/tests/transform_dialect/cuda/softmax_v2.mlir
@@ -6,7 +6,8 @@
 // RUN:     --iree-hal-configuration-pipeline | \
 // RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/softmax_v2_codegen_spec.mlir | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/softmax_v2_codegen_spec.mlir \
+// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
 // RUN: FileCheck %s --check-prefix=CHECK-SHUFFLE
 
 // RUN: iree-compile %s --iree-hal-target-backends=cuda \
@@ -15,7 +16,8 @@
 /// flags leak to the JIT session, which doesn't know what to do with them.
 // RUN:     --iree-flow-fuse-multi-use \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/softmax_v2_codegen_spec.mlir | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/softmax_v2_codegen_spec.mlir \
+// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
 // RUN: iree-run-module --module=- --function=softmax --device=cuda | \
 // RUN: FileCheck %s
 
diff --git a/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir
index dda89bf..67e3cb3 100644
--- a/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir
@@ -1,138 +1,143 @@
 // RUN: iree-opt %s
 
 // Codegen
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
-  %ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
-    in %variant_op : (!transform.any_op) -> !transform.any_op
-  %input_max_fill,
-  %input_max,
-  %exps_sum_fill,
-  %exp_and_exps_sum,
-  %div = transform.split_handle %ops
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op,
-                           !transform.any_op, !transform.any_op)
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @codegen(
+      %variant_op: !transform.any_op {transform.consumed}) {
 
-  // Step 1. First level of tiling + fusion parallelizes to blocks.
-  // ==============================================================
-  %_, %forall =
-  transform.structured.tile_using_forall %div tile_sizes [1, 4]
-    ( mapping = [#gpu.block<x>, #gpu.block<y>] )
-     : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> ()
+    %ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
+      in %variant_op : (!transform.any_op) -> !transform.any_op
+    %input_max_fill,
+    %input_max,
+    %exps_sum_fill,
+    %exp_and_exps_sum,
+    %div = transform.split_handle %ops
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op,
+                            !transform.any_op, !transform.any_op)
 
-  // TODO: Merging and fusing merged handles does not work properly atm.
-  transform.structured.fuse_into_containing_op %exp_and_exps_sum into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.structured.fuse_into_containing_op %exps_sum_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.structured.fuse_into_containing_op %input_max into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.structured.fuse_into_containing_op %input_max_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  // By default, fusion into scf.forall does not promote captured values
-  // to shared as this involves a cross-thread dependence analysis.
-  // Instead, we activate it explicitly post-hoc to promote all the extract_slice
-  // ops that we find and match the prerequisites
-  %forall_with_type = transform.cast %forall : !transform.any_op to !transform.op<"scf.forall">
-  transform.iree.share_forall_operands %forall_with_type
-    : (!transform.op<"scf.forall">) -> !transform.op<"scf.forall">
+    // Step 1. First level of tiling + fusion parallelizes to blocks.
+    // ==============================================================
+    %_, %forall =
+    transform.structured.tile_using_forall %div tile_sizes [1, 4]
+      ( mapping = [#gpu.block<x>, #gpu.block<y>] )
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> ()
 
-  // Canonicalizations.
-  %func_op = transform.structured.match ops{["func.func"]} in %variant_op
+    // TODO: Merging and fusing merged handles does not work properly atm.
+    transform.structured.fuse_into_containing_op %exp_and_exps_sum into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.structured.fuse_into_containing_op %exps_sum_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.structured.fuse_into_containing_op %input_max into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.structured.fuse_into_containing_op %input_max_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+    // By default, fusion into scf.forall does not promote captured values
+    // to shared as this involves a cross-thread dependence analysis.
+    // Instead, we activate it explicitly post-hoc to promote all the extract_slice
+    // ops that we find and match the prerequisites
+    %forall_with_type = transform.cast %forall : !transform.any_op to !transform.op<"scf.forall">
+    transform.iree.share_forall_operands %forall_with_type
+      : (!transform.op<"scf.forall">) -> !transform.op<"scf.forall">
+
+    // Canonicalizations.
+    %func_op = transform.structured.match ops{["func.func"]} in %variant_op
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func_op {
+      transform.apply_patterns.iree.fold_fill_into_pad
+      transform.apply_patterns.linalg.tiling_canonicalization
+      transform.apply_patterns.scf.for_loop_canonicalization
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.iree.apply_licm %func_op : !transform.any_op
+    transform.iree.apply_cse %func_op : !transform.any_op
+
+
+    // Step 2. Second level of tiling + fusion parallelizes to threads.
+    // ================================================================
+    %tiled_ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
+      in %variant_op : (!transform.any_op) -> !transform.any_op
+    %tiled_input_max_fill,
+    %tiled_input_max,
+    %tiled_exps_sum_fill,
+    %tiled_exp_and_exps_sum,
+    %tiled_div = transform.split_handle %tiled_ops
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op,
+                            !transform.any_op, !transform.any_op)
+    // Leaving the reduction untiled on threadIdx.x makes it sequential on
+    // threadIdx.x. After distribution, predication by `if (threadIdx.x == 0)` is
+    // introduced and opportunities for distributing vector ops across warps
+    // appear.
+    %reduction_linalg_ops = transform.merge_handles %tiled_input_max,
+                                                    %tiled_exp_and_exps_sum
+      : !transform.any_op
+    transform.structured.tile_using_forall %reduction_linalg_ops tile_sizes [1, 1]
+      ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    // Fully parallel ops are tiled and mapped.
+    %parallel_linalg_ops = transform.merge_handles %tiled_input_max_fill,
+                                                  %tiled_exps_sum_fill,
+                                                  %tiled_div
+      : !transform.any_op
+    transform.structured.tile_using_forall %parallel_linalg_ops num_threads [1, 4, 32]
+      ( mapping = [#gpu.thread<z>, #gpu.thread<y>, #gpu.thread<x>] )
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    // Canonicalizations.
+    transform.apply_patterns to %func_op {
+      transform.apply_patterns.iree.fold_fill_into_pad
+      transform.apply_patterns.linalg.tiling_canonicalization
+      transform.apply_patterns.scf.for_loop_canonicalization
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.iree.apply_licm %func_op : !transform.any_op
+    transform.iree.apply_cse %func_op : !transform.any_op
+
+    // Step 3. Rank-reduce and vectorize.
+    // ==================================
+    transform.apply_patterns to %func_op {
+      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+    } : !transform.any_op
+    transform.structured.vectorize_children_and_apply_patterns %func_op : (!transform.any_op) -> !transform.any_op
+
+    // Step 4. Bufferize and drop HAL descriptor from memref ops.
+    // =========================================================
+    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+    %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
+    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+
+    // Step 5. Post-bufferization mapping to blocks and threads.
+    // =========================================================
+    transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
+    transform.iree.map_nested_forall_to_gpu_threads %memref_func
+      workgroup_dims = [32, 4, 1] : (!transform.any_op) -> ()
+
+    // Step 6. Post-bufferization vector distribution with rank-reduction.
+    // ===================================================================
+    transform.apply_patterns to %memref_func {
+      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+      transform.apply_patterns.memref.fold_memref_alias_ops
+      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+    } : !transform.any_op
+    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
       : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func_op {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_licm %func_op : !transform.any_op
-  transform.iree.apply_cse %func_op : !transform.any_op
-
-
-  // Step 2. Second level of tiling + fusion parallelizes to threads.
-  // ================================================================
-  %tiled_ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
-    in %variant_op : (!transform.any_op) -> !transform.any_op
-  %tiled_input_max_fill,
-  %tiled_input_max,
-  %tiled_exps_sum_fill,
-  %tiled_exp_and_exps_sum,
-  %tiled_div = transform.split_handle %tiled_ops
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op,
-                           !transform.any_op, !transform.any_op)
-  // Leaving the reduction untiled on threadIdx.x makes it sequential on
-  // threadIdx.x. After distribution, predication by `if (threadIdx.x == 0)` is
-  // introduced and opportunities for distributing vector ops across warps
-  // appear.
-  %reduction_linalg_ops = transform.merge_handles %tiled_input_max,
-                                                  %tiled_exp_and_exps_sum
-    : !transform.any_op
-  transform.structured.tile_using_forall %reduction_linalg_ops tile_sizes [1, 1]
-    ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  // Fully parallel ops are tiled and mapped.
-  %parallel_linalg_ops = transform.merge_handles %tiled_input_max_fill,
-                                                 %tiled_exps_sum_fill,
-                                                 %tiled_div
-    : !transform.any_op
-  transform.structured.tile_using_forall %parallel_linalg_ops num_threads [1, 4, 32]
-    ( mapping = [#gpu.thread<z>, #gpu.thread<y>, #gpu.thread<x>] )
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-  // Canonicalizations.
-  transform.apply_patterns to %func_op {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_licm %func_op : !transform.any_op
-  transform.iree.apply_cse %func_op : !transform.any_op
-
-  // Step 3. Rank-reduce and vectorize.
-  // ==================================
-  transform.apply_patterns to %func_op {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
-  transform.structured.vectorize_children_and_apply_patterns %func_op : (!transform.any_op) -> !transform.any_op
-
-  // Step 4. Bufferize and drop HAL decriptor from memref ops.
-  // =========================================================
-  transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-  %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-
-  // Step 5. Post-bufferization mapping to blocks and threads.
-  // =========================================================
-  transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
-  transform.iree.map_nested_forall_to_gpu_threads %memref_func
-    workgroup_dims = [32, 4, 1] : (!transform.any_op) -> ()
-
-  // Step 6. Post-bufferization vector distribution with rank-reduction.
-  // ===================================================================
-  transform.apply_patterns to %memref_func {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.memref.fold_memref_alias_ops
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
-  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
-    : (!transform.any_op) -> !transform.any_op
-  %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
-    : (!transform.any_op) -> !transform.any_op
-  transform.iree.vector.warp_distribute %memref_func
-    : (!transform.any_op) -> ()
-
-
-  // Late canonicalizations.
-  %func_op_3 = transform.structured.match ops{["func.func"]} in %variant_op_3
+    %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
       : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func_op_3 {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_licm %func_op_3 : !transform.any_op
-  transform.iree.apply_cse %func_op_3 : !transform.any_op
-}
+    transform.iree.vector.warp_distribute %memref_func
+      : (!transform.any_op) -> ()
+
+
+    // Late canonicalizations.
+    %func_op_3 = transform.structured.match ops{["func.func"]} in %variant_op_3
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func_op_3 {
+      transform.apply_patterns.iree.fold_fill_into_pad
+      transform.apply_patterns.linalg.tiling_canonicalization
+      transform.apply_patterns.scf.for_loop_canonicalization
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.iree.apply_licm %func_op_3 : !transform.any_op
+    transform.iree.apply_cse %func_op_3 : !transform.any_op
+
+    transform.yield
+  }
+} // module
diff --git a/tests/transform_dialect/cuda/vecadd2d.mlir b/tests/transform_dialect/cuda/vecadd2d.mlir
deleted file mode 100644
index 7e03154..0000000
--- a/tests/transform_dialect/cuda/vecadd2d.mlir
+++ /dev/null
@@ -1,84 +0,0 @@
-!type = tensor<9x512xf32>
-!type2 = tensor<512x9xf32>
-
-#trait = { indexing_maps  = [affine_map<(d0, d1) -> (d0, d1)>],
-           iterator_types = ["parallel", "parallel"] }
-
-#trait2 = { indexing_maps  = [affine_map<(d0, d1) -> (d0, d1)>,
-                              affine_map<(d0, d1) -> (d1, d0)>],
-           iterator_types = ["parallel", "parallel"] }
-
-util.global private @"lhs" {noinline} = dense<0.0> : !type2
-util.global private @"rhs" {noinline} = dense<2.0> : !type
-
-func.func @vecadd2d() -> (!type2) {
-  %cst0 = arith.constant 0.000000e+00 : f32
-  %cst1 = arith.constant 2.000000e+00 : f32
-
-  %x_ptr = util.global.address @"rhs" : !util.ptr<!type>
-  %x = util.global.load.indirect %x_ptr : !util.ptr<!type> -> !type
-  %y_ptr = util.global.address @"lhs" : !util.ptr<!type2>
-  %y = util.global.load.indirect %y_ptr : !util.ptr<!type2> -> !type2
-
-  // Note: Two linalg.generics to fill the tensors will make IREE generate two
-  // separate kernels for the above and the below. It is important to validate
-  // the results.
-  %2 = linalg.generic #trait2 ins(%x : !type) outs(%y : !type2) {
-  ^bb0(%arg3: f32, %arg4: f32):
-    %3 = arith.addf %arg3, %arg4 : f32
-    linalg.yield %3 : f32
-  } -> !type2
-
-  return %2 : !type2
-}
-
-// RUN: iree-opt %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-abi-transformation-pipeline \
-// RUN:     --iree-flow-transformation-pipeline  \
-// RUN:     --iree-stream-transformation-pipeline \
-// RUN:     --iree-hal-configuration-pipeline | \
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
-// RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/vecadd2d_codegen_spec.mlir | \
-// RUN: FileCheck %s --check-prefix=CHECK
-
-// RUN: iree-opt %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-abi-transformation-pipeline \
-// RUN:     --iree-flow-transformation-pipeline  \
-// RUN:     --iree-stream-transformation-pipeline \
-// RUN:     --iree-hal-configuration-pipeline | \
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
-// RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/vecadd2d_codegen_spec_partial_tile.mlir | \
-// RUN: FileCheck %s --check-prefix=CHECK-PARTIAL-TILE
-
-// RUN: iree-compile %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-opt-const-expr-hoisting=false --iree-opt-const-eval=false \
-/// Constant JIT'ing must be disabled because the transform-dialect debug
-/// flags leak to the JIT session, which doesn't know what to do with them.
-// RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/vecadd2d_codegen_spec.mlir | \
-// RUN: iree-run-module --module=- --function=vecadd2d --device=cuda |\
-// RUN: FileCheck %s --check-prefix=EXEC
-
-//     CHECK:  hal.executable.export
-//     CHECK:  bb0(%[[DEV:.*]]: !hal.device):
-//     CHECK:  %[[C171:.*]] = arith.constant 171 : index
-//     CHECK:  %[[C1:.*]] = arith.constant 1 : index
-//     CHECK:  %[[C2:.*]] = arith.constant 2 : index
-//     CHECK:  hal.return %[[C171]], %[[C1]], %[[C2]] : index, index, index
-
-//     CHECK:  %[[BLKZ:.*]] = hal.interface.workgroup.id[2] : index
-//     CHECK:  %[[BLKX:.*]] = hal.interface.workgroup.id[0] : index
-//     CHECK:  memref.subview %0[%[[BLKZ:.*]], %[[BLKX:.*]]]
-
-//     CHECK-PARTIAL-TILE:  hal.executable.export
-//     CHECK-PARTIAL-TILE:  bb0(%[[DEV:.*]]: !hal.device):
-//     CHECK-PARTIAL-TILE:  %[[C1:.*]] = arith.constant 1 : index
-//     CHECK-PARTIAL-TILE:  %[[C1_2:.*]] = arith.constant 1 : index
-//     CHECK-PARTIAL-TILE:  %[[C171:.*]] = arith.constant 171 : index
-//     CHECK-PARTIAL-TILE:  hal.return %[[C1]], %[[C1_2]], %[[C171]] : index, index, index
-
-//      EXEC: EXEC @vecadd2d
-//      EXEC: result[0]: hal.buffer_view
-//      EXEC: 512x9xf32=[2 2 2 2 2 2 2 2 2][2 2 2 2 2 2 2 2 2]
diff --git a/tests/transform_dialect/cuda/vecadd2d_codegen_spec.mlir b/tests/transform_dialect/cuda/vecadd2d_codegen_spec.mlir
deleted file mode 100644
index 2f94296..0000000
--- a/tests/transform_dialect/cuda/vecadd2d_codegen_spec.mlir
+++ /dev/null
@@ -1,27 +0,0 @@
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
-  // Step 1. Find three linalg.generics and tile to GPU thread blocks.
-  // ===========================================================================
-  %generics = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %_, %forall_grid = transform.structured.tile_using_forall %generics 
-                  tile_sizes [5, 3] ( mapping = [#gpu.block<z>, #gpu.block<x>])
-                  : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
-
-
-  // Step 2. Rank reduce and bufferize and drop HAL decriptor from memref ops.
-  // ===========================================================================
-  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
-  transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-  %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-
-  // Step 3. Map to GPU thread blocks.
-  // ===========================================================================
-  transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
-}
diff --git a/tests/transform_dialect/cuda/vecadd2d_codegen_spec_partial_tile.mlir b/tests/transform_dialect/cuda/vecadd2d_codegen_spec_partial_tile.mlir
deleted file mode 100644
index fc373cb..0000000
--- a/tests/transform_dialect/cuda/vecadd2d_codegen_spec_partial_tile.mlir
+++ /dev/null
@@ -1,24 +0,0 @@
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
-  %generics = transform.structured.match ops{["linalg.generic"]} in %variant_op 
-    : (!transform.any_op) -> !transform.any_op
-  // Tile only one dimension, skip the other one.
-  %_, %forall_grid = transform.structured.tile_using_forall %generics 
-                  tile_sizes [0, 3] ( mapping = [#gpu.block<z>])
-                   : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
-
-
-  // Late canonicalizations to cleanup and pass the checks.
-  // Needs to occur on the whole variant to perform cse on the workgroup_count region
-  %func_op = transform.structured.match ops{["func.func"]} in %variant_op
-      : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func_op {
-    transform.apply_patterns.iree.fold_fill_into_pad
-    transform.apply_patterns.linalg.tiling_canonicalization
-    transform.apply_patterns.scf.for_loop_canonicalization
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
-  transform.iree.apply_licm %func_op : !transform.any_op
-  transform.iree.apply_cse %func_op : !transform.any_op
-}