[Codegen] Re-Enable transform dialect configuration strategy round 2 (#16427)

This time it just drops all transform dialect flag usage other than the
transform library file path + entry point name. This reduces code complexity
in `MaterializeUserConfigs`.
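
A minimal sketch of the consolidated flag usage (file names here are hypothetical; the sequence name after `@` is optional and defaults to `__kernel_config`):

```shell
# Library path plus an explicit configuration sequence name.
iree-compile input.mlir --iree-hal-target-backends=llvm-cpu \
  --iree-codegen-transform-dialect-library=path/to/transform_library.mlir@custom_matmul

# Library path only; the configuration entry point defaults to __kernel_config.
iree-compile input.mlir --iree-hal-target-backends=llvm-cpu \
  --iree-codegen-transform-dialect-library=path/to/transform_library.mlir
```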

Also cleans up some of the transform dialect tests, stopping their lit
testing at the same time. We might want to consider dropping some of them
entirely, as they aren't being maintained (the only thing they verify at the
moment is that the transform scripts are valid for CUDA).
diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp
index 038d3fa..1a918fe 100644
--- a/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp
@@ -25,25 +25,47 @@
 
 namespace mlir::iree_compiler {
 
-llvm::cl::opt<std::string> clCodegenTransformDialectStrategyName(
-    "iree-codegen-use-transform-dialect-strategy",
-    llvm::cl::desc(
-        "Broadcasts the given transform dialect strategy specification to all "
-        "dispatches. The specification is a symbol reference to load from a"
-        "library of transform specs (@library_call)"),
-    llvm::cl::init(""));
-
 llvm::cl::opt<std::string> clCodegenTransformDialectLibraryFileName(
     "iree-codegen-transform-dialect-library",
     llvm::cl::desc(
         "File path to a module containing a library of transform dialect"
-        "strategies"),
+        "strategies. Can be suffixed with the name of a transform sequence"
+        "within the library to run as preprocessing per executable variant."
+        "This is specified as <file-path>@<sequence-name>. If not specified,"
+        "this will default to `__kernel_config`."),
     llvm::cl::init(""));
 
 namespace {
 
 static const char kTranslationInfoAttrName[] = "translation_info";
 
+enum StrategyRunResult {
+  Success = 0,
+  NotFound = 1,
+  Failed = 2,
+};
+
+static StrategyRunResult
+runTransformConfigurationStrategy(Operation *payloadRoot,
+                                  StringRef entryPointName,
+                                  ModuleOp &transformLibrary) {
+  /// If we have a symbol, verify the existence of the symbol within the
+  /// transform library.
+  Operation *entryPoint = transform::detail::findTransformEntryPoint(
+      payloadRoot, transformLibrary, entryPointName);
+  if (!entryPoint) {
+    return StrategyRunResult::NotFound;
+  }
+
+  transform::TransformOptions options;
+  if (failed(transform::applyTransformNamedSequence(
+          payloadRoot, entryPoint, transformLibrary,
+          options.enableExpensiveChecks(true)))) {
+    return StrategyRunResult::Failed;
+  }
+  return StrategyRunResult::Success;
+}
+
 struct MaterializeUserConfigsPass
     : public MaterializeUserConfigsBase<MaterializeUserConfigsPass> {
   void getDependentDialects(DialectRegistry &registry) const override {
@@ -57,42 +79,73 @@
         getAllEntryPoints(moduleOp);
     MLIRContext *context = moduleOp.getContext();
 
+    // Parse the file path and kernel config strategy from flags. There are
+    // two possible usage flows for transform dialect libraries.
+    //   1. Use `__kernel_config` to match and annotate variants with the
+    //      strategy to use. This could either be a transform dialect strategy
+    //      or any other IREE codegen pipeline.
+    //
+    //   2. Use the configuration strategy to do codegen directly. At the end of
+    //      the strategy, the variant needs to be annotated with
+    //      "translation_info" = #iree_codegen.translation_info<None>
+    SmallVector<StringRef, 2> parts;
+    llvm::SplitString(llvm::StringRef(clCodegenTransformDialectLibraryFileName),
+                      parts, "@");
+    if (parts.size() > 2) {
+      variantOp.emitError()
+          << "Invalid transform library path and sequence name "
+          << clCodegenTransformDialectLibraryFileName;
+      return signalPassFailure();
+    }
+    bool hasTransformLibrary = !parts.empty();
+
+    std::string libraryFileName;
+    if (hasTransformLibrary) {
+      if (parts[0].empty()) {
+        variantOp.emitError() << "Cannot specify an empty library path";
+        return signalPassFailure();
+      }
+      libraryFileName = parts[0];
+    }
+
+    std::string entrySequenceName;
+    // Check if the user specified a custom entry point name.
+    if (parts.size() == 2) {
+      if (parts[1].empty()) {
+        variantOp.emitError() << "Cannot specify an empty sequence name";
+        return signalPassFailure();
+      }
+      entrySequenceName = parts[1];
+    } else {
+      entrySequenceName = "__kernel_config";
+    }
+
     LDBG("MaterializeUserConfigsPass on variant: " << variantOp);
     std::optional<ModuleOp> transformLibrary = std::nullopt;
-    if (!clCodegenTransformDialectLibraryFileName.empty()) {
+    if (hasTransformLibrary) {
       auto dialect =
           context->getOrLoadDialect<IREE::Codegen::IREECodegenDialect>();
-      auto maybeTransformLibrary = dialect->getOrLoadTransformLibraryModule(
-          clCodegenTransformDialectLibraryFileName);
+      auto maybeTransformLibrary =
+          dialect->getOrLoadTransformLibraryModule(libraryFileName);
       if (failed(maybeTransformLibrary)) {
-        variantOp.emitError() << "failed to load transform library module: "
-                              << clCodegenTransformDialectLibraryFileName;
+        variantOp.emitError()
+            << "failed to load transform library module: " << libraryFileName;
         return signalPassFailure();
       }
       transformLibrary = *maybeTransformLibrary;
-      LDBG("--found transform library @"
-           << clCodegenTransformDialectLibraryFileName);
-    }
+      LDBG("--found transform library @" << libraryFileName);
 
-    IREE::Codegen::DispatchLoweringPassPipeline tdPipeline =
-        IREE::Codegen::DispatchLoweringPassPipeline::TransformDialectCodegen;
-    std::optional<IREE::Codegen::TranslationInfoAttr> clTranslationInfo;
-    // Here we always set the pipeline strategy to transform dialect if the
-    // flag is non-empty to ensure we pick the right lowering pipeline in the
-    // event a strategy symbol is defined.
-    if (!clCodegenTransformDialectLibraryFileName.empty() ||
-        !clCodegenTransformDialectStrategyName.empty()) {
-      StringRef strategyName =
-          (clCodegenTransformDialectStrategyName.empty())
-              ? StringRef(
-                    transform::TransformDialect::kTransformEntryPointSymbolName)
-              : clCodegenTransformDialectStrategyName;
-      clTranslationInfo = IREE::Codegen::TranslationInfoAttr::get(
-          context, tdPipeline,
-          /*codegenSpec=*/
-          SymbolRefAttr::get(context, llvm::StringRef(strategyName)),
-          /*configuration=*/DictionaryAttr());
-      LDBG("--clTranslationInfo: " << clTranslationInfo);
+      auto runResult = runTransformConfigurationStrategy(
+          variantOp, entrySequenceName, *transformLibrary);
+      if (runResult == StrategyRunResult::NotFound) {
+        variantOp.emitError() << "transform kernel config strategy `"
+                              << entrySequenceName << "` not found";
+        return signalPassFailure();
+      } else if (runResult == StrategyRunResult::Failed) {
+        variantOp.emitError() << "transform kernel config strategy `"
+                              << entrySequenceName << "` failed to apply";
+        return signalPassFailure();
+      }
     }
 
     LDBG("--start iterating over: "
@@ -106,6 +159,11 @@
         continue;
       }
 
+      /// Nothing to do if the export already has a config.
+      if (getTranslationInfo(exportOp)) {
+        continue;
+      }
+
       /// First, apply all user configs.
       auto res = funcOp.walk([&](Operation *op) {
         if (auto compilationInfo = getCompilationInfo(op)) {
@@ -120,48 +178,14 @@
         moduleOp.emitOpError("error in setting user configuration");
         return signalPassFailure();
       }
-
-      /// Let user configs take priority over the global strategy flag.
-      if (IREE::Codegen::TranslationInfoAttr exportedTranslationInfo =
-              getTranslationInfo(exportOp)) {
-        if (translationInfo) {
-          /// Currently codegen is rooted on the variant, meaning every entry
-          /// must go through the same codegen pipeline. For multi-targeting we
-          /// will want to have multiple functions per variant, as well as
-          /// multiple exports per variant, meaning eventually the nesting of
-          /// the translation pipeline will need to change to the function, or
-          /// we'll need another level of module op nesting.
-          if (exportedTranslationInfo != translationInfo.value()) {
-            moduleOp.emitOpError(
-                "unhandled compilation of entry point functions with different "
-                "translation info");
-            return signalPassFailure();
-          }
-        } else {
-          translationInfo = exportedTranslationInfo;
-        }
-      } else {
-        if (translationInfo && translationInfo != clTranslationInfo) {
-          moduleOp.emitOpError(
-              "unhandled compilation of entry point functions with translation "
-              "info optionality");
-          return signalPassFailure();
-        }
-        if (clTranslationInfo) {
-          translationInfo = clTranslationInfo;
-          if (failed(setTranslationInfo(funcOp, translationInfo.value()))) {
-            moduleOp.emitOpError("failed to set command line translation info");
-            return signalPassFailure();
-          }
-        }
-      }
     }
 
     LDBG("--guaranteed unique translationInfo: " << translationInfo);
     /// We only need to resolve symbols for transform dialect based strategies.
     if (!translationInfo ||
         translationInfo.value().getDispatchLoweringPassPipeline() !=
-            tdPipeline) {
+            IREE::Codegen::DispatchLoweringPassPipeline::
+                TransformDialectCodegen) {
       return;
     }
 
diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.h b/compiler/src/iree/compiler/Codegen/Common/Passes.h
index e67f46e..48cfad4 100644
--- a/compiler/src/iree/compiler/Codegen/Common/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/Common/Passes.h
@@ -267,7 +267,8 @@
 
 /// Create an IREE-specific Transform dialect interpreter pass with all
 /// registrations necessary for IREE.
-std::unique_ptr<Pass> createTransformDialectInterpreterPass();
+std::unique_ptr<Pass>
+createTransformDialectInterpreterPass(StringRef transformSequenceName = "");
 
 /// Pass to propagate type to avoid generating load/stores of illegal types.
 std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp b/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp
index 4852485..a5b4ca2 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp
@@ -72,13 +72,19 @@
 
 namespace mlir::iree_compiler {
 
-extern llvm::cl::opt<std::string> clCodegenTransformDialectStrategyName;
 extern llvm::cl::opt<std::string> clCodegenTransformDialectLibraryFileName;
 
 /// Create a Transform dialect interpreter pass.
-std::unique_ptr<Pass> createTransformDialectInterpreterPass() {
+std::unique_ptr<Pass>
+createTransformDialectInterpreterPass(StringRef transformSequenceName) {
+  StringRef libraryPath = "";
+  SmallVector<StringRef, 2> parts;
+  llvm::SplitString(llvm::StringRef(clCodegenTransformDialectLibraryFileName),
+                    parts, "@");
+  if (!parts.empty()) {
+    libraryPath = parts[0];
+  }
   return std::make_unique<TransformDialectInterpreterPass>(
-      clCodegenTransformDialectLibraryFileName,
-      clCodegenTransformDialectStrategyName);
+      libraryPath, transformSequenceName);
 }
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
index 009305c..9a9cd79 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
@@ -186,9 +186,12 @@
     break;
   }
   // Transform-dialect pipelines.
-  case IREE::Codegen::DispatchLoweringPassPipeline::TransformDialectCodegen:
-    addTransformDialectPasses(pipeline);
+  case IREE::Codegen::DispatchLoweringPassPipeline::TransformDialectCodegen: {
+    SymbolRefAttr codegenSpec = translationInfo.value().getCodegenSpec();
+    addTransformDialectPasses(
+        pipeline, codegenSpec ? codegenSpec.getLeafReference() : StringRef(""));
     break;
+  }
   default:
     moduleOp.emitOpError("Unsupported pipeline on CPU target.");
     return signalPassFailure();
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index 2f05877..a467ee2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -582,10 +582,11 @@
   addCPUBufferizePasses(nestedModulePM);
 }
 
-void addTransformDialectPasses(OpPassManager &passManager) {
+void addTransformDialectPasses(OpPassManager &passManager,
+                               StringRef entryPoint) {
   // Give control to the transform dialect.
   passManager.addPass(
-      mlir::iree_compiler::createTransformDialectInterpreterPass());
+      mlir::iree_compiler::createTransformDialectInterpreterPass(entryPoint));
   // Dropping the schedule is needed:
   //   1. if we want to embed the transform in the module: we should drop the
   //      schedule once applied.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h
index 91a1cf6..d9b7efa 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h
@@ -169,7 +169,8 @@
                                     bool lowerToVectors = true);
 
 /// Transform dialect-based common.
-void addTransformDialectPasses(OpPassManager &passManager);
+void addTransformDialectPasses(OpPassManager &passManager,
+                               StringRef entryPoint);
 
 // Populates the passes needed to do tiling, decomposing, and vectorizing the
 // convolution ops.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
index a89102c..084d1cc 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
@@ -128,9 +128,12 @@
     addGPUPackUnPackPasses(pipeline);
     break;
   // Transform-dialect pipelines.
-  case IREE::Codegen::DispatchLoweringPassPipeline::TransformDialectCodegen:
-    addGPUTransformDialectPasses(pipeline);
+  case IREE::Codegen::DispatchLoweringPassPipeline::TransformDialectCodegen: {
+    SymbolRefAttr codegenSpec = translationInfo.value().getCodegenSpec();
+    addGPUTransformDialectPasses(
+        pipeline, codegenSpec ? codegenSpec.getLeafReference() : StringRef(""));
     break;
+  }
   // no pipeline specified, nothing to do.
   case IREE::Codegen::DispatchLoweringPassPipeline::None:
     return;
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index 77c0885..c76d7fc 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -740,9 +740,10 @@
 extern llvm::cl::opt<std::string> clGPUCodegenTransformDialectDebugPayloadTag;
 extern llvm::cl::opt<std::string> clGPUCodegenTransformDialectDebugTransformTag;
 
-void addGPUTransformDialectPasses(OpPassManager &passManager) {
+void addGPUTransformDialectPasses(OpPassManager &passManager,
+                                  StringRef entryPoint) {
   passManager.addPass(
-      mlir::iree_compiler::createTransformDialectInterpreterPass());
+      mlir::iree_compiler::createTransformDialectInterpreterPass(entryPoint));
 
   // Dropping the schedule is needed:
   //   1. if we want to embed the transform in the module: we should drop the
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
index 40dce21..d1bed4b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
@@ -40,7 +40,7 @@
 void addGPUSimpleDistributePassPipeline(OpPassManager &pm);
 
 /// Transform dialect-based path.
-void addGPUTransformDialectPasses(OpPassManager &pm);
+void addGPUTransformDialectPasses(OpPassManager &pm, StringRef entryPoint);
 
 /// Lowering transpose using shared memory.
 void addGPUTransposePassPipeline(OpPassManager &pm);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
index 69b56cb..d50b8e5 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
@@ -52,12 +52,13 @@
             "pack_shared_memory_alloc.mlir",
             "tensor_pad.mlir",
             "tensorcore_vectorization.mlir",
-            "transform_dialect_hoist_allocs.mlir",
-            "transform_dialect_vector_distribution.mlir",
             "transform_dialect_bufferize.mlir",
             "transform_dialect_eliminate_gpu_barriers.mlir",
+            "transform_dialect_hoist_allocs.mlir",
             "transform_dialect_pack_shared_memory_alloc.mlir",
             "transform_dialect_promote_operands.mlir",
+            "transform_dialect_vector_distribution.mlir",
+            "transform_dialect_vector_to_nvgpu_mma.mlir",
             "transform_distribute_forall.mlir",
             "transform_gpu_pipelining.mlir",
             "transform_vector_to_mma.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
index c6e9113..7cbffbc 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
@@ -54,6 +54,7 @@
     "transform_dialect_pack_shared_memory_alloc.mlir"
     "transform_dialect_promote_operands.mlir"
     "transform_dialect_vector_distribution.mlir"
+    "transform_dialect_vector_to_nvgpu_mma.mlir"
     "transform_distribute_forall.mlir"
     "transform_gpu_pipelining.mlir"
     "transform_vector_to_mma.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir
index 7b11993..2a59b35 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir
@@ -1,11 +1,11 @@
 // RUN: iree-opt %s  --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-codegen-llvmgpu-configuration-pipeline, iree-llvmgpu-lower-executable-target)))" \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_bufferize_spec.mlir | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_bufferize_spec.mlir@__transform_main | \
 // RUN: FileCheck %s
 
 // RUN: iree-opt %s  --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-codegen-llvmgpu-configuration-pipeline, iree-llvmgpu-lower-executable-target)))" \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_foreach_to_gpu_spec.mlir | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_foreach_to_gpu_spec.mlir@__transform_main | \
 // RUN: FileCheck %s --check-prefix=FOREACH-TO-GPU
 
 #device_target_cuda = #hal.device.target<"cuda", {executable_targets = [#hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>]}>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir
index 472b508..3a47a94 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir
@@ -4,6 +4,11 @@
     transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
     %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> !transform.any_op
     %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+
+    // Annotate the exported function as already translated.
+    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
+    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
     transform.yield
   }
 } // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir
index d8a1a99..b15fe98 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir
@@ -40,6 +40,11 @@
     } : !transform.any_op
     transform.iree.apply_licm %memref_func : !transform.any_op
     transform.apply_cse to %memref_func : !transform.any_op
+
+    // Annotate the exported function as already translated.
+    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
+    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
     transform.yield
   }
 } // module
diff --git a/tests/transform_dialect/cuda/mma.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_to_nvgpu_mma.mlir
similarity index 100%
rename from tests/transform_dialect/cuda/mma.mlir
rename to compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_to_nvgpu_mma.mlir
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp
index a0b043f..7018f13 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp
@@ -264,9 +264,10 @@
   spirvPM.addPass(spirv::createSPIRVUpdateVCEPass());
 }
 
-void addSPIRVTransformDialectPasses(OpPassManager &passManager) {
+void addSPIRVTransformDialectPasses(OpPassManager &passManager,
+                                    StringRef entryPoint) {
   passManager.addPass(
-      mlir::iree_compiler::createTransformDialectInterpreterPass());
+      mlir::iree_compiler::createTransformDialectInterpreterPass(entryPoint));
 
   // Dropping the schedule is needed:
   //   1. if we want to embed the transform in the module: we should drop the
@@ -647,8 +648,9 @@
   nestedModulePM.addPass(createCSEPass());
 }
 
-void addSPIRVTransformDialectPassPipeline(OpPassManager &pm) {
-  addSPIRVTransformDialectPasses(pm);
+void addSPIRVTransformDialectPassPipeline(OpPassManager &pm,
+                                          StringRef entryPoint) {
+  addSPIRVTransformDialectPasses(pm, entryPoint);
 
   // Run GenericVectorization pass additionally to convert vectors into forms
   // needed for SPIR-V.
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/Passes.h b/compiler/src/iree/compiler/Codegen/SPIRV/Passes.h
index ac91fe2..e9e0d40 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/Passes.h
@@ -44,7 +44,8 @@
 void addSPIRVSubgroupReducePassPipeline(OpPassManager &pm);
 
 /// Pass pipeline to lower IREE HAL executables via transform dialect schedules.
-void addSPIRVTransformDialectPassPipeline(OpPassManager &pm);
+void addSPIRVTransformDialectPassPipeline(OpPassManager &pm,
+                                          StringRef entryPoint);
 
 /// Pass pipeline to lower winograd ops. This pipeline follows the
 /// SPIRVBaseVectorize pipeline with the following exception:
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVLowerExecutableTargetPass.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVLowerExecutableTargetPass.cpp
index 721d902..ded7a8a 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVLowerExecutableTargetPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVLowerExecutableTargetPass.cpp
@@ -113,9 +113,12 @@
   case CodeGenPipeline::SPIRVWinogradVectorize:
     addSPIRVWinogradVectorizePassPipeline(pipeline);
     break;
-  case CodeGenPipeline::TransformDialectCodegen:
-    addSPIRVTransformDialectPassPipeline(pipeline);
+  case CodeGenPipeline::TransformDialectCodegen: {
+    SymbolRefAttr codegenSpec = translationInfo.value().getCodegenSpec();
+    addSPIRVTransformDialectPassPipeline(
+        pipeline, codegenSpec ? codegenSpec.getLeafReference() : StringRef(""));
     break;
+  }
   // No pipeline specified, nothing to do.
   case CodeGenPipeline::None:
     return;
diff --git a/samples/transform_dialect/example_module.mlir b/samples/transform_dialect/example_module.mlir
index c5eab46..1e4ac4e 100644
--- a/samples/transform_dialect/example_module.mlir
+++ b/samples/transform_dialect/example_module.mlir
@@ -107,28 +107,29 @@
 }
 
 /// We test first with threading off so that the printers are legible.
-// R-UN: iree-compile %s --iree-hal-target-backends=vulkan \
-// R-UN:   --iree-codegen-use-transform-dialect-strategy=transform_main \
-// R-UN:   --iree-codegen-transform-dialect-library=%p/transform_library.mlir \
-// R-UN:   --compile-from=executable-sources \
-// R-UN:   --compile-to=executable-targets \
-// R-UN:   --mlir-disable-threading | \
-// R-UN: FileCheck %s --check-prefixes=CODEGEN-PRINTER
+// RUN: iree-compile %s --iree-hal-target-backends=vulkan \
+// RUN:   --iree-codegen-transform-dialect-library=%p/transform_library.mlir@kernel_config \
+// RUN:   --compile-from=executable-sources \
+// RUN:   --compile-to=executable-targets \
+// RUN:   --mlir-disable-threading | \
+// RUN: FileCheck %s --check-prefixes=CODEGEN-PRINTER
 
-// CODEGEN-PRINTER:     IR printer: Setting matmul strategy to default top-level
-// CODEGEN-PRINTER:       translation_info = #iree_codegen.translation_info<TransformDialectCodegen codegen_spec = @transform_main
+// CODEGEN-PRINTER:     IR printer: Setting matmul strategy to custom_transform_strategy
+// CODEGEN-PRINTER:       translation_info = #iree_codegen.translation_info<TransformDialectCodegen codegen_spec = @custom_transform_strategy>
 // CODEGEN-PRINTER:     IR printer: Setting reduce strategy to base vectorize top-level
 // CODEGEN-PRINTER:       translation_info = #iree_codegen.translation_info<SPIRVBaseVectorize>, workgroup_size = [16 : index, 1 : index, 1 : index]
 
 /// Then test with threading to make sure it runs
 // RUN: iree-compile %s --iree-hal-target-backends=vulkan \
-// RUN:   --iree-codegen-use-transform-dialect-strategy=@transform_main \
-// RUN:   --iree-codegen-transform-dialect-library=%p/transform_library.mlir \
+// RUN:   --iree-codegen-transform-dialect-library=%p/transform_library.mlir@kernel_config \
 // RUN:   --compile-from=executable-sources \
 // RUN:   --compile-to=executable-targets \
 // RUN:   --mlir-disable-threading | \
 // RUN: FileCheck %s --check-prefixes=CODEGEN
 
+// CODEGEN: Ran custom_transform_strategy
 // CODEGEN: spirv.func @example_module_dispatch_0_generic_80_f32
-// CODEGEN: spirv.func @example_module_dispatch_1_matmul_16x16x5_f32
+// CODEGEN: hal.executable private @example_module_dispatch_1
+// CODEGEN:   #iree_codegen.translation_info<TransformDialectCodegen codegen_spec = @custom_transform_strategy>
+// CODEGEN:     spirv.func @example_module_dispatch_1_matmul_16x16x5_f32
 // CODEGEN: spirv.func @example_module_dispatch_2_generic_16x16_f32
diff --git a/samples/transform_dialect/transform_library.mlir b/samples/transform_dialect/transform_library.mlir
index 3bb75ad..8b17af7 100644
--- a/samples/transform_dialect/transform_library.mlir
+++ b/samples/transform_dialect/transform_library.mlir
@@ -1,13 +1,76 @@
 module attributes { transform.with_named_sequence } {
-  // Print and send it down normal IREE codegen.
-  transform.named_sequence @custom_matmul(%matmul: !transform.any_op {transform.consumed}) {  
-    %1 = transform.structured.generalize %matmul : (!transform.any_op) -> !transform.any_op
-    transform.print {name = "Setting matmul strategy to default"}
+  // Example of a custom matmul strategy. The target matmul is annotated with
+  // the name of this strategy down below before strategy selection, overriding
+  // default IREE codegen.
+  transform.named_sequence @custom_transform_strategy(
+      %variant_op: !transform.any_op {transform.consumed}) {
+    // Step 1. Re-match the matmul
+    // ===========================================================================
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+
+    // Step 2. Tile to grid
+    // ===========================================================================
+    %grid_reduction, %forall_grid =
+    transform.structured.tile_using_forall %matmul tile_sizes [16, 16] ( mapping = [#gpu.block<x>, #gpu.block<y>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
+
+    // Step 3. Vectorize
+    // ===========================================================================
+    %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+    } : !transform.any_op
+    %func_1 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
+
+    // Step 4. Bufferize
+    // ===========================================================================
+    transform.apply_patterns to %func_1 {
+      transform.apply_patterns.iree.fold_fill_into_pad
+      transform.apply_patterns.linalg.tiling_canonicalization
+      transform.apply_patterns.scf.for_loop_canonicalization
+    } : !transform.any_op
+    transform.apply_patterns to %func_1 {
+      transform.apply_patterns.tensor.reassociative_reshape_folding
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.apply_cse to %func_1 : !transform.any_op
+    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+    transform.apply_patterns to %func_1 {
+      transform.apply_patterns.linalg.erase_unnecessary_inputs
+    } : !transform.any_op
+    %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
+    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+
+    // Step 5. Post-bufferization vector distribution
+    // ===========================================================================
+    %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
+    transform.iree.map_nested_forall_to_gpu_threads %func_7
+        workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
+
+    // Step 6. Do layout analysis and lower to mma
+    // ===========================================================================
+    %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
+    transform.print {name = "Ran custom_transform_strategy"}
     transform.yield
   }
 
-  // Send it down subgroup reduce.
-  transform.named_sequence @use_subgroup_reduce(%reduce: !transform.any_op {transform.readonly}) {  
+  // Send it down a custom transform dialect pipeline.
+  transform.named_sequence @custom_matmul(%matmul: !transform.any_op {transform.readonly}) {
+    %variant_op = transform.get_parent_op %matmul {op_name = "hal.executable.variant"} : (!transform.any_op) -> !transform.any_op
+    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %custom_strategy = transform.param.constant #iree_codegen.translation_info<TransformDialectCodegen
+                                                                               codegen_spec = @custom_transform_strategy> -> !transform.any_param
+    transform.annotate %exports "translation_info" = %custom_strategy : !transform.any_op, !transform.any_param
+    transform.print {name = "Setting matmul strategy to custom_transform_strategy"}
+    transform.yield
+  }
+
+  // Send it down base vectorize with a custom tiling configuration.
+  transform.named_sequence @use_base_vectorize(%reduce: !transform.any_op {transform.readonly}) {
     %variant_op = transform.get_parent_op %reduce {op_name = "hal.executable.variant"} : (!transform.any_op) -> !transform.any_op
     %lowering_config = transform.param.constant #iree_codegen.lowering_config<tile_sizes = [[8, 0], [1, 0], [0, 0, 4]]> -> !transform.any_param
     transform.annotate %reduce "lowering_config" = %lowering_config : !transform.any_op, !transform.any_param
@@ -42,10 +105,34 @@
     transform.yield %matched : !transform.any_op
   }
 
-  transform.named_sequence @transform_main(%variant_op: !transform.any_op {transform.consumed}) {  
+  // An example of a custom transform dialect based kernel config. Note that
+  // because of the way `transform.foreach_match` works, the callback cannot
+  // manipulate IR beyond the op *given* to the matcher, as foreach_match will
+  // attempt to keep walking the IR even after a successful match. The expected
+  // flow for a strategy like this is as follows:
+  //
+  // Author an entry point like this (@kernel_config) that walks the IR and
+  // attempts to annotate the dispatch with the codegen strategy to use, i.e.
+  //   transform.foreach_match in %variant_op
+  //       @matcher_0 -> @annotator_0,
+  //       @matcher_1 -> @annotator_1,
+  //       ...
+  //
+  // the annotators should attach an #iree_codegen.translation_info attribute
+  // to the `hal.executable.export` ops within the variant as well as any
+  // relevant op specific tile sizes (and other important attributes like
+  // workgroup_size and subgroup_size, if relevant). This will then get handed
+  // off to backend specific kernel config, which will let these user configs
+  // pass through unperturbed.
+  //
+  // To couple this with a transform dialect based codegen strategy, the target
+  // codegen strategy can be included inline with this library and relevant ops
+  // can be annotated with `TransformDialectCodegen` as the lowering pipeline,
+  // with a reference to the strategy to use (see an example above).
+  transform.named_sequence @kernel_config(%variant_op: !transform.any_op {transform.consumed}) {
     transform.foreach_match in %variant_op
         @match_matmul -> @custom_matmul,
-        @match_reduce -> @use_subgroup_reduce
+        @match_reduce -> @use_base_vectorize
       : (!transform.any_op) -> (!transform.any_op)
     transform.yield
   }
diff --git a/tests/e2e/linalg_transform/BUILD.bazel b/tests/e2e/linalg_transform/BUILD.bazel
deleted file mode 100644
index 9b5ffa4..0000000
--- a/tests/e2e/linalg_transform/BUILD.bazel
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright 2022 The IREE Authors
-#
-# Licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
-
-package(
-    features = ["layering_check"],
-    licenses = ["notice"],  # Apache 2.0
-)
-
-iree_lit_test_suite(
-    name = "check_linalg_transform",
-    srcs = ["linalg_transform.mlir"],
-    cfg = "//tests:lit.cfg.py",
-    # transform_dialect_xxx_spec are MLIR files that specify a transformation,
-    # they need to be included as data.
-    data = [
-        "//tests/e2e/linalg_transform:transform_dialect_codegen_spec.mlir",
-        "//tests/e2e/linalg_transform:transform_dialect_dispatch_spec.mlir",
-    ],
-    tags = [
-        "hostonly",
-    ],
-    tools = [
-        "//tools:iree-opt",
-        "//tools:iree-run-mlir",
-        "@llvm-project//lld",
-        "@llvm-project//llvm:FileCheck",
-    ],
-)
diff --git a/tests/e2e/linalg_transform/CMakeLists.txt b/tests/e2e/linalg_transform/CMakeLists.txt
deleted file mode 100644
index 38c6124..0000000
--- a/tests/e2e/linalg_transform/CMakeLists.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-################################################################################
-# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
-# tests/e2e/linalg_transform/BUILD.bazel                                       #
-#                                                                              #
-# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
-# CMake-only content.                                                          #
-#                                                                              #
-# To disable autogeneration for this file entirely, delete this header.        #
-################################################################################
-
-iree_add_all_subdirs()
-
-iree_lit_test_suite(
-  NAME
-    check_linalg_transform
-  SRCS
-    "linalg_transform.mlir"
-  TOOLS
-    ${IREE_LLD_TARGET}
-    FileCheck
-    iree-opt
-    iree-run-mlir
-  DATA
-    iree::tests::e2e::linalg_transform::transform_dialect_codegen_spec.mlir
-    iree::tests::e2e::linalg_transform::transform_dialect_dispatch_spec.mlir
-  LABELS
-    "hostonly"
-)
-
-### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/tests/e2e/linalg_transform/linalg_transform.mlir b/tests/e2e/linalg_transform/linalg_transform.mlir
deleted file mode 100644
index 796ec22..0000000
--- a/tests/e2e/linalg_transform/linalg_transform.mlir
+++ /dev/null
@@ -1,45 +0,0 @@
-// R-UN: iree-run-mlir --Xcompiler,iree-hal-target-backends=llvm-cpu \
-/// Specify the dispatch region formation with the transform dialect.
-// R-UN:   --iree-flow-dispatch-use-transform-dialect=%p/transform_dialect_dispatch_spec.mlir \
-/// Specify the codegen strategy with the transform dialect.
-// R-UN:   --iree-codegen-use-transform-dialect-strategy=%p/transform_dialect_codegen_spec.mlir \
-// R-UN: %s | FileCheck %s
-
-
-// RUN: iree-opt %s \
-// RUN:   --iree-abi-transformation-pipeline \
-// RUN:   --iree-flow-transformation-pipeline \
-// RUN:   --iree-flow-dispatch-use-transform-dialect=%p/transform_dialect_dispatch_spec.mlir
-
-func.func @matmul_static() -> tensor<5x5xf32> {
-  %res = flow.tensor.constant dense<[
-    [0.0, 0.0, 0.0, 0.0, 0.0],
-    [0.0, 0.0, 0.0, 0.0, 0.0],
-    [0.0, 0.0, 0.0, 0.0, 0.0],
-    [0.0, 0.0, 0.0, 0.0, 0.0],
-    [0.0, 0.0, 0.0, 0.0, 0.0]]> : tensor<5x5xf32> -> tensor<5x5xf32>
-  %lhs = flow.tensor.constant dense<[
-    [15.0, 14.0, 13.0],
-    [12.0, 11.0, 10.0],
-    [09.0, 08.0, 07.0],
-    [06.0, 05.0, 04.0],
-    [03.0, 02.0, 01.0]]> : tensor<5x3xf32> -> tensor<5x3xf32>
-  %rhs = flow.tensor.constant dense<[
-    [15.0, 14.0, 13.0, 12.0, 11.0],
-    [10.0, 09.0, 08.0, 07.0, 06.0],
-    [05.0, 04.0, 03.0, 02.0, 01.0]]> : tensor<3x5xf32> -> tensor<3x5xf32>
-
-  %matmul = linalg.matmul
-      ins(%lhs, %rhs : tensor<5x3xf32>, tensor<3x5xf32>)
-      outs(%res : tensor<5x5xf32>) -> tensor<5x5xf32>
-  %matmul_res = util.optimization_barrier %matmul : tensor<5x5xf32>
-
-  return %matmul_res : tensor<5x5xf32>
-}
-
-//      CHECK: 5x5xf32=
-// CHECK-SAME: [430 388 346 304 262]
-// CHECK-SAME: [340 307 274 241 208]
-// CHECK-SAME: [250 226 202 178 154]
-// CHECK-SAME: [160 145 130 115 100]
-// CHECK-SAME: [70 64 58 52 46]
diff --git a/tests/e2e/linalg_transform/transform_dialect_codegen_spec.mlir b/tests/e2e/linalg_transform/transform_dialect_codegen_spec.mlir
deleted file mode 100644
index c95b85c..0000000
--- a/tests/e2e/linalg_transform/transform_dialect_codegen_spec.mlir
+++ /dev/null
@@ -1,5 +0,0 @@
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
-  %variant_op_2 = transform.iree.bufferize %variant_op
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_2 : (!transform.any_op) -> !transform.any_op
-}
diff --git a/tests/e2e/linalg_transform/transform_dialect_dispatch_spec.mlir b/tests/e2e/linalg_transform/transform_dialect_dispatch_spec.mlir
deleted file mode 100644
index 53fbec9..0000000
--- a/tests/e2e/linalg_transform/transform_dialect_dispatch_spec.mlir
+++ /dev/null
@@ -1,7 +0,0 @@
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  %tiled_op, %foreach_op = transform.structured.tile_using_forall %0 num_threads [13, 33]
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  %dispatch_op = transform.iree.forall_to_flow %foreach_op : (!transform.any_op) -> !transform.any_op
-}
diff --git a/tests/transform_dialect/cpu/attention.mlir b/tests/transform_dialect/cpu/attention.mlir
index 9dd587f..b103ba6 100644
--- a/tests/transform_dialect/cpu/attention.mlir
+++ b/tests/transform_dialect/cpu/attention.mlir
@@ -9,8 +9,7 @@
 }
 
 // RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \
-// RUN: --iree-codegen-transform-dialect-library=%p/attention_codegen_spec.mlir \
-// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \
+// RUN: --iree-codegen-transform-dialect-library=%p/attention_codegen_spec.mlir@codegen | \
 // RUN: iree-run-module --module=- --function=attention | \
 // RUN: FileCheck %s --check-prefixes=EXEC
 
diff --git a/tests/transform_dialect/cpu/attention_codegen_spec.mlir b/tests/transform_dialect/cpu/attention_codegen_spec.mlir
index 5f22de1..ab64721 100644
--- a/tests/transform_dialect/cpu/attention_codegen_spec.mlir
+++ b/tests/transform_dialect/cpu/attention_codegen_spec.mlir
@@ -63,6 +63,11 @@
     } : !transform.any_op
     transform.apply_cse to %func_8 : !transform.any_op
     transform.memref.erase_dead_alloc_and_stores %func_8 : (!transform.any_op) -> ()
+
+    // Annotate the exported function as already translated.
+    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
+    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
     transform.yield
   } // codegen
 
diff --git a/tests/transform_dialect/cpu/matmul.mlir b/tests/transform_dialect/cpu/matmul.mlir
index 246b712..39a2074 100644
--- a/tests/transform_dialect/cpu/matmul.mlir
+++ b/tests/transform_dialect/cpu/matmul.mlir
@@ -11,20 +11,7 @@
 
 // RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \
 // RUN:   --iree-opt-data-tiling=false \
-// RUN:   --compile-to=executable-configurations | \
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs,iree-llvmcpu-lower-executable-target)))' \
-// RUN:   --iree-codegen-transform-dialect-library=%p/matmul_codegen_default_spec.mlir \
-// RUN:   --iree-codegen-use-transform-dialect-strategy=codegen | \
-// RUN: FileCheck %s --check-prefixes=CODEGEN-DEFAULT
-
-// CODEGEN-DEFAULT:     hal.executable.export public @matmul_static_dispatch_0_matmul_3x3x5
-// CODEGEN-DEFAULT-DAG:     %[[C1:.+]] = arith.constant 1 : index
-// CODEGEN-DEFAULT-DAG:     %[[C2:.+]] = arith.constant 2 : index
-// CODEGEN-DEFAULT:         hal.return %[[C2]], %[[C1]], %[[C1]]
-
-// RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \
-// RUN:   --iree-opt-data-tiling=false \
-// RUN:   --iree-codegen-use-transform-dialect-strategy=%p/matmul_codegen_default_spec.mlir | \
+// RUN:   --iree-codegen-transform-dialect-library=%p/matmul_codegen_default_spec.mlir@codegen | \
 // RUN: iree-run-module --module=- --function=matmul_static \
 // RUN:   --input="3x5xf32=1" \
 // RUN:   --input="5x3xf32=2" \
diff --git a/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir b/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir
index 34eaa2f..1fa3105 100644
--- a/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir
+++ b/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir
@@ -22,6 +22,11 @@
     // =========================================================
     %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
     transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
+
+    // Annotate the exported function as already translated.
+    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
+    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
     transform.yield
   }
 } // module
diff --git a/tests/transform_dialect/cpu/matmul_library_call.mlir b/tests/transform_dialect/cpu/matmul_library_call.mlir
index e2f066a..ea93fbb 100644
--- a/tests/transform_dialect/cpu/matmul_library_call.mlir
+++ b/tests/transform_dialect/cpu/matmul_library_call.mlir
@@ -14,8 +14,7 @@
 
 // RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \
 // RUN:   --iree-opt-data-tiling=false \
-// RUN:   --iree-codegen-use-transform-dialect-strategy=custom_matmul \
-// RUN:   --iree-codegen-transform-dialect-library=%p/transform_library.mlir \
+// RUN:   --iree-codegen-transform-dialect-library=%p/transform_library.mlir@custom_matmul \
 // RUN:   --compile-to=executable-targets | \
 // RUN: FileCheck %s --check-prefixes=CODEGEN-DEFAULT
 
@@ -26,8 +25,7 @@
 
 // RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \
 // RUN:   --iree-opt-data-tiling=false \
-// RUN:   --iree-codegen-transform-dialect-library=%p/transform_library.mlir \
-// RUN:   --iree-codegen-use-transform-dialect-strategy=custom_matmul | \
+// RUN:   --iree-codegen-transform-dialect-library=%p/transform_library.mlir@custom_matmul | \
 // RUN: iree-run-module --module=- --function=matmul_static \
 // RUN:   --input="3x5xf32=1" \
 // RUN:   --input="5x3xf32=2" \
diff --git a/tests/transform_dialect/cuda/BUILD.bazel b/tests/transform_dialect/cuda/BUILD.bazel
index b8bf507..a35e0db 100644
--- a/tests/transform_dialect/cuda/BUILD.bazel
+++ b/tests/transform_dialect/cuda/BUILD.bazel
@@ -28,7 +28,6 @@
 iree_lit_test_suite(
     name = "lit",
     srcs = [
-        "mma.mlir",
         # TODO(#15892): reductions have flakes and need to be triaged.
         # "reduction.mlir",
         # "reduction_eltwise.mlir",
diff --git a/tests/transform_dialect/cuda/CMakeLists.txt b/tests/transform_dialect/cuda/CMakeLists.txt
index 347630b..7534bc8 100644
--- a/tests/transform_dialect/cuda/CMakeLists.txt
+++ b/tests/transform_dialect/cuda/CMakeLists.txt
@@ -21,8 +21,6 @@
 iree_lit_test_suite(
   NAME
     lit
-  SRCS
-    "mma.mlir"
   TOOLS
     FileCheck
     iree-compile
diff --git a/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir b/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir
index cb53367..9293fd4 100644
--- a/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir
+++ b/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir
@@ -15,8 +15,7 @@
 // RUN:     --iree-hal-cuda-llvm-target-arch=sm_80 \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
 // RUN:     --iree-flow-dispatch-use-transform-dialect=%p/double_mma_layout_analysis_dispatch_spec.mlir \
-// RUN:     --iree-codegen-transform-dialect-library=%p/double_mma_layout_analysis_codegen_spec.mlir \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/double_mma_layout_analysis_codegen_spec.mlir@codegen | \
 // RUN: iree-run-module --module=- --function=double_matmul --device=cuda \
 // RUN: --input="16x16xf16=[[0.0999755859375,0.2249755859375,0.07501220703125,0.0,0.07501220703125,0.2249755859375,0.175048828125,0.07501220703125,0.175048828125,0.07501220703125,0.024993896484375,0.1500244140625,0.1500244140625,0.2249755859375,0.199951171875,0.1500244140625],[0.1500244140625,0.199951171875,0.0999755859375,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.0999755859375,0.0999755859375,0.024993896484375,0.2249755859375,0.2249755859375,0.2249755859375,0.0,0.024993896484375,0.04998779296875],[0.07501220703125,0.0,0.125,0.125,0.04998779296875,0.2249755859375,0.024993896484375,0.199951171875,0.199951171875,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.175048828125,0.07501220703125,0.125],[0.04998779296875,0.024993896484375,0.0,0.2249755859375,0.07501220703125,0.024993896484375,0.024993896484375,0.0,0.07501220703125,0.1500244140625,0.1500244140625,0.175048828125,0.2249755859375,0.1500244140625,0.07501220703125,0.0999755859375],[0.125,0.0,0.199951171875,0.04998779296875,0.199951171875,0.04998779296875,0.175048828125,0.125,0.0,0.0,0.199951171875,0.024993896484375,0.2249755859375,0.1500244140625,0.024993896484375,0.0],[0.04998779296875,0.2249755859375,0.0999755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.199951171875,0.125,0.07501220703125,0.04998779296875,0.199951171875,0.125,0.1500244140625],[0.1500244140625,0.125,0.175048828125,0.04998779296875,0.125,0.1500244140625,0.1500244140625,0.125,0.0999755859375,0.0,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.125,0.0999755859375],[0.0999755859375,0.199951171875,0.0999755859375,0.0999755859375,0.2249755859375,0.0,0.175048828125,0.0999755859375,0.125,0.07501220703125,0.07501220703125,0.175048828125,0.07501220703125,0.0,0.2249755859375,0.2249755859375],[0.07501220703125,0.024993896484375,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.0999755859375,0.024993896484375,0.0,0.0999755859375,0.0,0.0999755859375,0.2249755859375,0.175048828125,0.0,0.0],[0.024993896484375,0.0999755859375,0.2249755859375,0.2249755859375,0.125,0.2249755859375,0.04998779296875,0.04998779296875,0.04998779296875,0.024993896484375,0.0999755859375,0.2249755859375,0.024993896484375,0.024993896484375,0.0,0.07501220703125],[0.0,0.1500244140625,0.175048828125,0.1500244140625,0.2249755859375,0.024993896484375,0.1500244140625,0.0999755859375,0.024993896484375,0.0,0.125,0.04998779296875,0.125,0.199951171875,0.024993896484375,0.199951171875],[0.024993896484375,0.04998779296875,0.199951171875,0.0,0.07501220703125,0.199951171875,0.2249755859375,0.04998779296875,0.175048828125,0.0,0.199951171875,0.199951171875,0.1500244140625,0.199951171875,0.125,0.199951171875],[0.1500244140625,0.125,0.04998779296875,0.0999755859375,0.04998779296875,0.175048828125,0.04998779296875,0.0999755859375,0.2249755859375,0.199951171875,0.125,0.1500244140625,0.0999755859375,0.07501220703125,0.07501220703125,0.0999755859375],[0.0,0.04998779296875,0.125,0.024993896484375,0.04998779296875,0.199951171875,0.04998779296875,0.0999755859375,0.199951171875,0.07501220703125,0.1500244140625,0.125,0.199951171875,0.199951171875,0.0,0.125],[0.024993896484375,0.07501220703125,0.0,0.199951171875,0.024993896484375,0.024993896484375,0.024993896484375,0.175048828125,0.04998779296875,0.04998779296875,0.04998779296875,0.07501220703125,0.07501220703125,0.1500244140625,0.175048828125,0.199951171875],[0.0,0.125,0.0,0.07501220703125,0.125,0.125,0.07501220703125,0.1500244140625,0.04998779296875
,0.04998779296875,0.125,0.125,0.2249755859375,0.0999755859375,0.07501220703125,0.07501220703125]]" \
 // RUN: --input="16x16xf16=[[0.175048828125,0.07501220703125,0.199951171875,0.0,0.175048828125,0.125,0.199951171875,0.04998779296875,0.0999755859375,0.175048828125,0.07501220703125,0.04998779296875,0.125,0.125,0.07501220703125,0.2249755859375],[0.024993896484375,0.199951171875,0.0,0.1500244140625,0.175048828125,0.0999755859375,0.175048828125,0.1500244140625,0.2249755859375,0.07501220703125,0.199951171875,0.0999755859375,0.0999755859375,0.2249755859375,0.0999755859375,0.0999755859375],[0.2249755859375,0.2249755859375,0.125,0.175048828125,0.0,0.07501220703125,0.04998779296875,0.0,0.199951171875,0.1500244140625,0.024993896484375,0.2249755859375,0.024993896484375,0.1500244140625,0.2249755859375,0.199951171875],[0.1500244140625,0.125,0.024993896484375,0.07501220703125,0.125,0.125,0.07501220703125,0.1500244140625,0.04998779296875,0.175048828125,0.125,0.175048828125,0.175048828125,0.07501220703125,0.024993896484375,0.125],[0.2249755859375,0.125,0.2249755859375,0.1500244140625,0.0,0.0,0.1500244140625,0.125,0.024993896484375,0.125,0.0,0.024993896484375,0.175048828125,0.175048828125,0.024993896484375,0.125],[0.2249755859375,0.024993896484375,0.04998779296875,0.0,0.0,0.1500244140625,0.07501220703125,0.2249755859375,0.1500244140625,0.024993896484375,0.0,0.0999755859375,0.125,0.1500244140625,0.2249755859375,0.0],[0.125,0.0999755859375,0.0,0.0999755859375,0.199951171875,0.125,0.175048828125,0.175048828125,0.1500244140625,0.2249755859375,0.04998779296875,0.125,0.1500244140625,0.0,0.0,0.0999755859375],[0.125,0.07501220703125,0.175048828125,0.1500244140625,0.175048828125,0.0,0.04998779296875,0.125,0.125,0.024993896484375,0.0999755859375,0.175048828125,0.024993896484375,0.0,0.024993896484375,0.0],[0.2249755859375,0.024993896484375,0.0999755859375,0.04998779296875,0.125,0.07501220703125,0.0999755859375,0.024993896484375,0.125,0.125,0.125,0.024993896484375,0.125,0.04998779296875,0.0999755859375,0.07501220703125],[0.0999755859375,0.175048828125,0.199951171875,0.0999755859375,0.175048828125,0.07501220703125,0.024993896484375,0.125,0.07501220703125,0.0,0.125,0.07501220703125,0.07501220703125,0.0,0.199951171875,0.175048828125],[0.07501220703125,0.0999755859375,0.175048828125,0.07501220703125,0.125,0.1500244140625,0.0,0.0999755859375,0.2249755859375,0.199951171875,0.04998779296875,0.0,0.0,0.1500244140625,0.199951171875,0.2249755859375],[0.024993896484375,0.2249755859375,0.04998779296875,0.1500244140625,0.2249755859375,0.2249755859375,0.175048828125,0.0999755859375,0.024993896484375,0.199951171875,0.125,0.199951171875,0.175048828125,0.2249755859375,0.175048828125,0.0999755859375],[0.125,0.0999755859375,0.04998779296875,0.125,0.199951171875,0.07501220703125,0.199951171875,0.0,0.024993896484375,0.04998779296875,0.0,0.04998779296875,0.04998779296875,0.199951171875,0.1500244140625,0.0999755859375],[0.199951171875,0.0,0.125,0.04998779296875,0.07501220703125,0.175048828125,0.0999755859375,0.175048828125,0.024993896484375,0.07501220703125,0.0,0.1500244140625,0.07501220703125,0.024993896484375,0.07501220703125,0.175048828125],[0.1500244140625,0.125,0.0999755859375,0.175048828125,0.04998779296875,0.0,0.04998779296875,0.1500244140625,0.024993896484375,0.125,0.125,0.175048828125,0.125,0.0999755859375,0.175048828125,0.1500244140625],[0.07501220703125,0.199951171875,0.024993896484375,0.0999755859375,0.175048828125,0.07501220703125,0.1500244140625,0.04998779296875,0.0,0.024993896484375,0.07501220703125,0.07501220703125,0.1500244140625,0.04998779296875,0.2249755859375,0.1500244140625]]" \
diff --git a/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir
index 5157fb7..02a1d92 100644
--- a/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir
@@ -68,6 +68,11 @@
     %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
     %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
 
+    // Annotate the exported function as already translated.
+    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
+    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
+
     transform.yield
   }
 } // module
diff --git a/tests/transform_dialect/cuda/eltwise_reduction.mlir b/tests/transform_dialect/cuda/eltwise_reduction.mlir
index d276879..eabf1a0 100644
--- a/tests/transform_dialect/cuda/eltwise_reduction.mlir
+++ b/tests/transform_dialect/cuda/eltwise_reduction.mlir
@@ -31,69 +31,9 @@
   return %6 : !out_tensor_t
 }
 
-// RUN: iree-opt %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-abi-transformation-pipeline \
-// RUN:     --iree-flow-transformation-pipeline \
-// RUN:     --iree-stream-transformation-pipeline \
-// RUN:     --iree-hal-configuration-pipeline | \
-// RUN: FileCheck %s --check-prefix=DISPATCH
-
-// RUN: iree-opt %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-abi-transformation-pipeline \
-// RUN:     --iree-flow-transformation-pipeline \
-// RUN:     --iree-stream-transformation-pipeline \
-// RUN:     --iree-hal-configuration-pipeline | \
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))'
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/%S_codegen_spec.mlir | \
-// RUN: FileCheck %s
-
 // RUN: iree-compile %s --iree-hal-target-backends=cuda | \
 // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\
 // RUN: FileCheck %s --check-prefix=EXEC
 
-// Check that both generics ended up in the same region.
-// DISPATCH:     hal.executable.variant
-// DISPATCH:     linalg.fill
-// DISPATCH-NOT: hal.executable.variant
-// DISPATCH:     linalg.generic
-// DISPATCH-NOT: hal.executable.variant
-// DISPATCH:     linalg.generic
-
-//     CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-//     CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
-//     CHECK-DAG: %[[F0:.*]] = arith.constant dense<0.000000e+00> : vector<f32>
-//     CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index
-//     CHECK-DAG: %[[SHMEM_ALLOC:.*]] = memref.alloc() {alignment = 128 : i64} : memref<1x2xf32, 3>
-//     CHECK-DAG: %[[TIDX:.]] = gpu.thread_id  x
-//     CHECK-DAG: %[[TIDY:.]] = gpu.thread_id  y
-//     CHECK-DAG: %[[TIDZ:.]] = gpu.thread_id  z
-
-//         CHECK: %[[SHMEM_VIEW_EXPANDED:.*]] = memref.subview %[[SHMEM_ALLOC]][%[[TIDZ]], %[[TIDY]]]{{.*}}to memref<f32, {{.*}}, 3>
-
-// Distributed reduction: everyone loads, does the elementwise then 5 xor + addf expected
-//         CHECK: vector.transfer_read %{{.*}}[%[[TIDZ]], %[[TIDY]], %[[TIDX]]]
-//         CHECK: arith.addf
-//         CHECK: arith.addf
-// CHECK-COUNT-5: gpu.shuffle  xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf
-
-//         CHECK: %[[RES:.*]] = arith.addf %{{.*}}
-
-//         CHECK: %[[RES_VEC:.*]] = vector.broadcast %[[RES]] : f32 to vector<f32>
-//         CHECK: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index
-//         CHECK: scf.if %[[CONDXIS0]]
-//         CHECK:   vector.transfer_write %[[RES_VEC]], %[[SHMEM_VIEW_EXPANDED]][]
-//         CHECK: gpu.barrier
-
-// Last part is not distributed atm and is only run by threadIdx.x == 0 and threadIdx.y == 0.
-//         CHECK: %[[CONDYIS0:.*]] = arith.cmpi ult, %[[TIDY]], %[[C1]] : index
-//          TODO: cond eq 0 and cond ult 1 do not CSE atm.
-//         CHECK: %[[CONXANDYARE0:.*]] = arith.andi %{{.*}}, %[[CONDYIS0]] : i1
-//         CHECK: scf.if %[[CONXANDYARE0]] {
-//         CHECK:   vector.transfer_read
-//         CHECK:   vector.reduction <add>
-//         CHECK:   vector.transfer_write
-//         CHECK: gpu.barrier
-//         CHECK: memref.dealloc %[[SHMEM_ALLOC]] : memref<1x2xf32, 3>
-
 //      EXEC: result[0]: hal.buffer_view
 // EXEC-NEXT: 8xf32=256 256 256 256 256 256 256 256
diff --git a/tests/transform_dialect/cuda/eltwise_reduction_codegen_spec.mlir b/tests/transform_dialect/cuda/eltwise_reduction_codegen_spec.mlir
deleted file mode 100644
index 2f03915..0000000
--- a/tests/transform_dialect/cuda/eltwise_reduction_codegen_spec.mlir
+++ /dev/null
@@ -1,103 +0,0 @@
-// RUN: iree-opt %s
-
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
-  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-
-  // Step 1. Split the reduction to get meatier (size(red) / 2)-way parallelism.
-  // ===========================================================================
-  %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %eltwise, %reduction = transform.split_handle %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  %init_or_alloc_op, %more_parallel_fill_op, %more_parallel_op, %combiner_op =
-    transform.structured.split_reduction %reduction
-      { split_factor = 2, insert_split_dimension = 1 }
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-
-  // Step 2. First level of tiling + fusion parallelizes to blocks.
-  // ===========================================================================
-  %grid_combiner_op, %forall_grid =
-    transform.structured.tile_using_forall %combiner_op tile_sizes [1]
-      ( mapping = [#gpu.block<x>] )
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-  // Step 2.1: Cannot fuse across the "expand_shape" produced by reduction
-  // splitting above, so we need to bubble that up via patterns and rematch
-  // the entire structure.
-  // TODO: bubbling should be a proper transform op, at which point we will be
-  // able to preserve the handles.
-  // ===========================================================================
-  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func {
-    transform.apply_patterns.iree.bubble_expand
-  } : !transform.any_op
-  %fills = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %fill_2, %more_parallel_fill_2 = transform.split_handle %fills
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  %generics = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %expanded_eltwise, %more_parallel_2, %combiner_2 =
-    transform.split_handle %generics : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-  %forall_grid_2 = transform.structured.match ops{["scf.forall"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %not_combiner = transform.merge_handles %fill_2, %more_parallel_fill_2, %more_parallel_2, %expanded_eltwise : !transform.any_op
-  transform.structured.fuse_into_containing_op %not_combiner into %forall_grid_2 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-  // Step 3. Second level of tiling + fusion parallelizes to threads. Also
-  // fuse in the leading elementwise.
-  // ===========================================================================
-  %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op : (!transform.any_op) -> !transform.any_op
-  %forall_block_combiner_op, %block_combiner_op =
-    transform.structured.tile_using_forall %combiner_2 tile_sizes [1]
-    ( mapping = [#gpu.thread<z>] )
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.structured.fuse_into_containing_op %fill_1d into %forall_block_combiner_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-  %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op : (!transform.any_op) -> !transform.any_op
-  %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]}
-    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %grid_eltwise_op = transform.structured.match ops{["linalg.generic"]}
-    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %forall_block_more_parallel_op, %block_more_parallel_op =
-    transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1]
-    ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.structured.fuse_into_containing_op %grid_eltwise_op into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-  // Step 4. Rank-reduce and vectorize.
-  // ===========================================================================
-  %func_1 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func_1 {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
-  %func_3 = transform.structured.vectorize_children_and_apply_patterns %func_1 : (!transform.any_op) -> !transform.any_op
-
-  // Step 5. Bufferize and drop HAL descriptor from memref ops.
-  // ===========================================================================
-  transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-  %variant_op_2 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_2 : (!transform.any_op) -> !transform.any_op
-
-  // Step 6. Post-bufferization mapping to blocks and threads.
-  // ===========================================================================
-  %func_4 = transform.structured.match ops{["func.func"]} in %variant_op_2 : (!transform.any_op) -> !transform.any_op
-  transform.iree.forall_to_workgroup %func_4 : (!transform.any_op) -> ()
-  transform.iree.map_nested_forall_to_gpu_threads %func_4 workgroup_dims = [32, 2, 1] : (!transform.any_op) -> ()
-
-  // Step 7. Post-bufferization vector distribution with rank-reduction.
-  // ===========================================================================
-  transform.apply_patterns to %func_4 {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
-  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_2 : (!transform.any_op) -> !transform.any_op
-  // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0)
-  // at this point.
-  transform.sequence %variant_op_2 : !transform.any_op failures(suppress) {
-  ^bb0(%arg0: !transform.any_op):
-    transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
-    : (!transform.any_op) -> !transform.any_op
-  }
-  transform.iree.vector.warp_distribute %func_4 : (!transform.any_op) -> ()
-}
diff --git a/tests/transform_dialect/cuda/eltwise_reduction_eltwise.mlir b/tests/transform_dialect/cuda/eltwise_reduction_eltwise.mlir
index 3ad0c96..70aa332 100644
--- a/tests/transform_dialect/cuda/eltwise_reduction_eltwise.mlir
+++ b/tests/transform_dialect/cuda/eltwise_reduction_eltwise.mlir
@@ -43,72 +43,9 @@
   return %8 : !out_tensor_t
 }
 
-// RUN: iree-opt %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-abi-transformation-pipeline \
-// RUN:     --iree-flow-transformation-pipeline \
-// RUN:     --iree-stream-transformation-pipeline \
-// RUN:     --iree-hal-configuration-pipeline | \
-// RUN: FileCheck %s --check-prefix=DISPATCH
-
-// RUN: iree-opt %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-abi-transformation-pipeline \
-// RUN:     --iree-flow-transformation-pipeline \
-// RUN:     --iree-stream-transformation-pipeline \
-// RUN:     --iree-hal-configuration-pipeline | \
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))'
-// RUN:     --iree-codegen-use-transform-dialect-strategy=%p/%S_codegen_spec.mlir | \
-// RUN: FileCheck %s
-
 // RUN: iree-compile %s --iree-hal-target-backends=cuda | \
 // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\
 // RUN: FileCheck %s --check-prefix=EXEC
 
-// Check that all generics ended up in the same region.
-// DISPATCH:     hal.executable.variant
-// DISPATCH:     linalg.fill
-// DISPATCH-NOT: hal.executable.variant
-// DISPATCH:     linalg.generic
-// DISPATCH-NOT: hal.executable.variant
-// DISPATCH:     linalg.generic
-// DISPATCH-NOT: hal.executable.variant
-// DISPATCH:     linalg.generic
-
-//     CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-//     CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
-//     CHECK-DAG: %[[F0:.*]] = arith.constant dense<0.000000e+00> : vector<f32>
-//     CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index
-//     CHECK-DAG: %[[SHMEM_ALLOC:.*]] = memref.alloc() {alignment = 128 : i64} : memref<1x2xf32, 3>
-//     CHECK-DAG: %[[TIDX:.]] = gpu.thread_id  x
-//     CHECK-DAG: %[[TIDY:.]] = gpu.thread_id  y
-//     CHECK-DAG: %[[TIDZ:.]] = gpu.thread_id  z
-
-//         CHECK: %[[SHMEM_VIEW_EXPANDED:.*]] = memref.subview %[[SHMEM_ALLOC]][%[[TIDZ]], %[[TIDY]]]{{.*}}to memref<f32, {{.*}}, 3>
-
-// Distributed reduction: everyone loads, does the elementwise then 5 xor + addf expected
-//         CHECK: vector.transfer_read %{{.*}}[%[[TIDZ]], %[[TIDY]], %[[TIDX]]]
-//         CHECK: arith.addf
-//         CHECK: arith.addf
-// CHECK-COUNT-5: gpu.shuffle  xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf
-
-//         CHECK: %[[RES:.*]] = arith.addf %{{.*}}
-
-//         CHECK: %[[RES_VEC:.*]] = vector.broadcast %[[RES]] : f32 to vector<f32>
-//         CHECK: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index
-//         CHECK: scf.if %[[CONDXIS0]]
-//         CHECK:   vector.transfer_write %[[RES_VEC]], %[[SHMEM_VIEW_EXPANDED]][]
-//         CHECK: gpu.barrier
-
-// Last part is not distributed atm and is only run by threadIdx.x == 0 and threadIdx.y == 0.
-//         CHECK: %[[CONDYIS0:.*]] = arith.cmpi ult, %[[TIDY]], %[[C1]] : index
-//          TODO: cond eq 0 and cond ult 1 do not CSE atm.
-//         CHECK: %[[CONXANDYARE0:.*]] = arith.andi %{{.*}}, %[[CONDYIS0]] : i1
-//         CHECK: scf.if %[[CONXANDYARE0]] {
-//         CHECK:   vector.transfer_read
-//         CHECK:   vector.reduction <add>
-//         CHECK:   math.sqrt
-//         CHECK:   vector.transfer_write
-//         CHECK: gpu.barrier
-//         CHECK: memref.dealloc %[[SHMEM_ALLOC]] : memref<1x2xf32, 3>
-
 //      EXEC: result[0]: hal.buffer_view
 // EXEC-NEXT: 8xf32=16 16 16 16 16 16 16 16
diff --git a/tests/transform_dialect/cuda/eltwise_reduction_eltwise_codegen_spec.mlir b/tests/transform_dialect/cuda/eltwise_reduction_eltwise_codegen_spec.mlir
deleted file mode 100644
index 0c55aa3..0000000
--- a/tests/transform_dialect/cuda/eltwise_reduction_eltwise_codegen_spec.mlir
+++ /dev/null
@@ -1,111 +0,0 @@
-// RUN: iree-opt %s
-
-transform.sequence failures(propagate) {
-^bb1(%variant_op: !transform.any_op):
-  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-
-  // Step 1. Split the reduction to get meatier (size(red) / 2)-way parallelism.
-  // ===========================================================================
-  %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %leading_eltwise, %reduction, %trailing_eltwise = transform.split_handle %0
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-  %init_or_alloc_op, %more_parallel_fill_op, %more_parallel_op, %combiner_op =
-    transform.structured.split_reduction %reduction
-      { split_factor = 2, insert_split_dimension = 1 }
-       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-
-  // Step 2. First level of tiling + fusion parallelizes to blocks. Tile the
-  // trailing elementwise the same way we want to tile the reduction.
-  // ===========================================================================
-  %trailing_eltwise_grid_op, %grid_loop =
-    transform.structured.tile_using_forall %trailing_eltwise tile_sizes [1]
-      ( mapping = [#gpu.block<x>] )
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-  // Step 2.1: Cannot fuse across the "expand_shape" produced by reduction
-  // splitting above, so we need to bubble that up via patterns and rematch
-  // the entire structure.
-  // TODO: bubbling should be a proper transform op, at which point we will be
-  // able to preserve the handles.
-  // ===========================================================================
-  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func {
-    transform.apply_patterns.iree.bubble_expand
-  } : !transform.any_op
-  %fills = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %fill_2, %more_parallel_fill_2 = transform.split_handle %fill
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  %generics = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %expanded_eltwise, %more_parallel_2, %combiner_2, %trailing_eltwise_2 =
-    transform.split_handle %generics
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-  %forall_grid_2 = transform.structured.match ops{["scf.forall"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %not_trailing = transform.merge_handles %fill_2, %more_parallel_fill_2,
-    %more_parallel_2, %expanded_eltwise, %combiner_2 : !transform.any_op
-  transform.structured.fuse_into_containing_op %not_trailing into %forall_grid_2 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-  // Step 3. Second level of tiling + fusion parallelizes to threads. Also
-  // fuse in the leading and trailing elementwise.
-  // ===========================================================================
-  %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op : (!transform.any_op) -> !transform.any_op
-  %block_trailing_eltwise_op, %forall_trailing_eltwise_op =
-    transform.structured.tile_using_forall %trailing_eltwise_2 tile_sizes [1]
-    ( mapping = [#gpu.thread<z>] )
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  %block_combiner_op = transform.structured.match ops{["linalg.generic"]}
-    attributes {iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %fill_and_reduction = transform.merge_handles %fill_1d, %block_combiner_op : !transform.any_op
-  transform.structured.fuse_into_containing_op %fill_and_reduction into %forall_trailing_eltwise_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-  %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op : (!transform.any_op) -> !transform.any_op
-  %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]}
-    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %grid_eltwise_op = transform.structured.match ops{["linalg.generic"]}
-    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  %block_more_parallel_op, %forall_block_more_parallel_op =
-    transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1]
-    ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
-    : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-  transform.structured.fuse_into_containing_op %grid_eltwise_op into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-  // Step 4. Rank-reduce and vectorize.
-  // ===========================================================================
-  %func_1 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %func_1 {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
-  %func_2 = transform.structured.vectorize_children_and_apply_patterns %func_1 : (!transform.any_op) -> !transform.any_op
-
-  // Step 5. Bufferize and drop HAL descriptor from memref ops.
-  // ===========================================================================
-  transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-  %variant_op_2 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_2 : (!transform.any_op) -> !transform.any_op
-
-  // Step 6. Post-bufferization mapping to blocks and threads.
-  // ===========================================================================
-  %func_3 = transform.structured.match ops{["func.func"]} in %variant_op_2 : (!transform.any_op) -> !transform.any_op
-  transform.iree.forall_to_workgroup %func_3 : (!transform.any_op) -> ()
-  transform.iree.map_nested_forall_to_gpu_threads %func_3 workgroup_dims = [32, 2, 1] : (!transform.any_op) -> ()
-
-  // Step 7. Post-bufferization vector distribution with rank-reduction.
-  // ===========================================================================
-  transform.apply_patterns to %func_3 {
-    transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-    transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-    transform.apply_patterns.memref.fold_memref_alias_ops
-    transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-  } : !transform.any_op
-  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_2 : (!transform.any_op) -> !transform.any_op
-  // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0)
-  // at this point.
-  transform.sequence %variant_op_2 : !transform.any_op failures(suppress) {
-  ^bb0(%arg0: !transform.any_op):
-    transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
-    : (!transform.any_op) -> !transform.any_op
-  }
-  transform.iree.vector.warp_distribute %func_3 : (!transform.any_op) -> ()
-}
diff --git a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir
index c60c261..d1362d0 100644
--- a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir
@@ -3,66 +3,71 @@
 module attributes { transform.with_named_sequence } {
   transform.named_sequence @__transform_main(
       %variant_op: !transform.any_op {transform.consumed}) {
-    // Step 1. Find the fill, matmul and generic ops
-    // ===========================================================================
-    %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %matmul = transform.structured.match ops{["linalg.generic"]}
-                attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]}
-                in %variant_op : (!transform.any_op) -> !transform.any_op
-    %generic = transform.structured.match ops{["linalg.generic"]}
-                attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]}
-                in %variant_op : (!transform.any_op) -> !transform.any_op
+      // Step 1. Find the fill, matmul and generic ops
+      // ===========================================================================
+      %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+      %matmul = transform.structured.match ops{["linalg.generic"]}
+                  attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]}
+                  in %variant_op : (!transform.any_op) -> !transform.any_op
+      %generic = transform.structured.match ops{["linalg.generic"]}
+                  attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]}
+                  in %variant_op : (!transform.any_op) -> !transform.any_op
 
-    // Step 2. Tile the generic and fuse the fill and matmul
-    // ===========================================================================
-    %grid_reduction, %forall_grid =
-    transform.structured.tile_using_forall %generic tile_sizes [16] ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
+      // Step 2. Tile the generic and fuse the fill and matmul
+      // ===========================================================================
+      %grid_reduction, %forall_grid =
+      transform.structured.tile_using_forall %generic tile_sizes [16] ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+      transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
 
-    transform.structured.fuse_into_containing_op %matmul into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+      transform.structured.fuse_into_containing_op %matmul into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+      transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
 
-    // Step 3. Vectorize
-    // ===========================================================================
-    %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func {
-      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-    } : !transform.any_op
-    %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
+      // Step 3. Vectorize
+      // ===========================================================================
+      %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+      transform.apply_patterns to %func {
+        transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+        transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+        transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+      } : !transform.any_op
+      %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
 
-    // Step 4. Bufferize
-    // ===========================================================================
-    transform.apply_patterns to %func_3 {
-      transform.apply_patterns.iree.fold_fill_into_pad
-      transform.apply_patterns.linalg.tiling_canonicalization
-      transform.apply_patterns.scf.for_loop_canonicalization
-    } : !transform.any_op
-    transform.apply_patterns to %func_3 {
-      transform.apply_patterns.tensor.reassociative_reshape_folding
-      transform.apply_patterns.canonicalization
-    } : !transform.any_op
-    transform.apply_cse to %func_3 : !transform.any_op
-    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-    transform.apply_patterns to %func_3 {
-      transform.apply_patterns.linalg.erase_unnecessary_inputs
-    } : !transform.any_op
-    %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
-    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+      // Step 4. Bufferize
+      // ===========================================================================
+      transform.apply_patterns to %func_3 {
+        transform.apply_patterns.iree.fold_fill_into_pad
+        transform.apply_patterns.linalg.tiling_canonicalization
+        transform.apply_patterns.scf.for_loop_canonicalization
+      } : !transform.any_op
+      transform.apply_patterns to %func_3 {
+        transform.apply_patterns.tensor.reassociative_reshape_folding
+        transform.apply_patterns.canonicalization
+      } : !transform.any_op
+      transform.apply_cse to %func_3 : !transform.any_op
+      transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+      transform.apply_patterns to %func_3 {
+        transform.apply_patterns.linalg.erase_unnecessary_inputs
+      } : !transform.any_op
+      %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
+      %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
 
-    // Step 6. Post-bufferization vector distribution
-    // ===========================================================================
-    %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-    transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
-    transform.iree.map_nested_forall_to_gpu_threads %func_7
-        workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
+      // Step 6. Post-bufferization vector distribution
+      // ===========================================================================
+      %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+      transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
+      transform.iree.map_nested_forall_to_gpu_threads %func_7
+          workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
 
-    // Step 7. Do layout analysis and lower to mma
-    // ===========================================================================
-    %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-    %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
-    transform.yield
-  }
+      // Step 7. Do layout analysis and lower to mma
+      // ===========================================================================
+      %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+      %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
+
+      // Annotate the exported function as already translated.
+      %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+      %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
+      transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
+      transform.yield
+    }
 } // module
 
diff --git a/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir b/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir
index a99b19d..f01f07c 100644
--- a/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir
+++ b/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir
@@ -27,8 +27,7 @@
 // RUN:     --iree-hal-cuda-llvm-target-arch=sm_80 \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
 // RUN:     --iree-flow-dispatch-use-transform-dialect=%p/mma_reduction_layout_analysis_dispatch_spec.mlir \
-// RUN:     --iree-codegen-transform-dialect-library=%p/mma_reduction_layout_analysis_codegen_spec.mlir \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/mma_reduction_layout_analysis_codegen_spec.mlir@codegen | \
 // RUN: iree-run-module --module=- --function=matmul_reduction --device=cuda \
 // RUN: --input="16x16xf16=[[3.0,2.0,2.5,4.5,1.5,4.0,2.0,2.5,4.0,4.0,1.5,0.5,2.0,3.0,0.5,2.0],[2.5,2.5,0.5,3.5,0.0,2.5,3.5,1.0,0.5,0.0,3.0,4.5,0.5,0.5,0.0,3.5],[4.5,3.0,4.0,2.5,1.0,0.5,0.0,4.5,0.0,2.5,3.5,0.0,2.0,4.5,1.5,4.5],[0.0,2.0,1.5,0.0,2.0,1.5,3.0,2.0,2.0,4.0,4.0,2.5,0.0,3.0,2.0,0.5],[0.5,3.5,3.0,2.5,0.0,2.5,3.0,3.0,4.5,2.0,2.0,1.0,2.0,1.0,3.5,2.0],[0.0,4.5,2.0,4.0,2.5,2.5,1.5,1.5,1.5,3.0,3.0,0.0,2.5,0.5,2.0,2.0],[3.5,4.0,3.5,1.5,2.0,0.5,1.0,2.5,4.0,3.5,0.0,3.0,0.0,1.5,4.5,0.0],[4.5,3.5,1.0,4.5,0.5,0.0,1.5,4.5,1.5,3.5,3.0,2.5,0.0,0.5,0.0,4.0],[2.0,3.0,0.5,2.0,1.5,0.5,2.0,2.5,2.5,4.0,2.0,4.5,4.0,0.0,2.0,3.0],[2.5,4.0,4.0,3.0,2.0,2.0,4.5,0.5,4.5,1.0,2.0,0.0,4.5,1.0,3.0,0.5],[4.0,1.5,3.5,3.0,2.5,4.5,1.0,3.5,3.0,2.5,2.5,2.0,2.0,4.5,1.5,2.5],[3.0,3.0,0.0,2.5,1.0,3.0,0.0,1.5,1.5,2.5,0.5,1.0,3.0,3.5,1.5,1.5],[0.0,4.5,0.5,1.5,0.5,4.0,3.5,4.0,4.0,0.0,0.5,1.0,4.5,1.5,0.0,3.5],[2.5,2.0,2.5,1.5,3.0,0.0,2.0,1.0,2.5,4.0,0.0,4.0,4.0,1.5,3.0,2.5],[3.0,0.0,4.0,4.0,2.0,0.5,1.0,3.5,4.0,2.5,4.0,4.5,0.0,3.0,1.5,2.5],[0.5,0.5,2.5,4.0,1.0,2.5,0.5,4.5,2.0,3.0,1.5,4.5,1.5,4.5,0.5,1.5]]" \
 // RUN: --input="16x16xf16=[[3.5,3.0,4.5,3.0,3.0,0.0,2.0,2.5,2.0,0.0,4.5,2.5,0.5,0.0,4.0,3.5],[0.0,0.5,2.0,4.5,0.0,4.0,1.5,3.5,0.5,2.5,3.5,1.5,3.5,4.5,4.0,3.0],[3.0,3.5,2.5,1.5,1.5,1.5,0.5,4.5,0.0,3.5,4.0,0.0,0.0,2.0,0.5,1.0],[1.5,4.0,3.5,3.5,0.0,0.0,0.0,2.0,3.0,1.5,0.0,3.0,0.0,2.5,2.0,3.0],[3.5,4.0,2.5,1.5,3.0,2.0,3.0,4.5,1.5,3.0,2.0,3.5,2.5,4.5,0.5,3.5],[0.0,0.0,0.0,0.5,1.0,2.5,1.5,1.0,2.5,1.5,0.0,1.5,1.5,2.0,4.5,2.5],[4.0,1.5,3.0,2.5,2.5,3.5,2.0,4.0,1.5,2.5,0.5,4.0,1.0,4.5,3.5,0.0],[1.0,2.0,4.0,4.5,4.5,3.5,0.0,1.0,4.5,3.5,2.0,3.0,0.5,4.0,3.5,1.5],[1.0,0.0,2.5,4.5,0.0,2.0,0.0,2.5,3.0,4.0,2.5,0.5,3.5,0.0,3.5,1.0],[0.0,3.5,4.0,0.0,0.0,4.5,1.0,3.5,1.5,3.0,2.0,1.0,0.5,0.5,2.0,0.0],[1.5,0.0,4.5,2.0,4.5,4.5,3.5,3.0,2.5,4.5,0.5,0.5,0.0,4.5,0.0,4.0],[4.5,3.5,4.0,4.0,1.5,4.0,1.0,4.0,2.5,0.5,4.5,3.5,3.5,0.5,4.5,3.0],[0.0,3.0,2.5,1.0,1.5,2.0,1.0,1.5,4.0,2.5,3.5,1.0,3.5,2.5,3.5,4.5],[1.5,4.5,2.0,2.0,2.0,0.5,4.0,2.0,4.0,3.5,4.0,1.0,1.5,2.5,1.0,0.0],[0.0,0.0,1.0,2.5,3.5,2.5,4.0,0.0,2.0,2.0,4.5,0.5,1.0,3.5,3.0,2.5],[2.0,2.0,0.5,2.0,4.5,2.5,3.0,1.5,4.5,2.0,3.5,3.0,1.0,2.0,1.5,2.0]]" |\
diff --git a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir
index d72accb..a7c32a4 100644
--- a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir
@@ -66,6 +66,10 @@
     %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
     %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
 
+    // Annotate the exported function as already translated.
+    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
+    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
     transform.yield
   }
 } // module
diff --git a/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir
index 33bfe44..83e2496 100644
--- a/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir
@@ -70,6 +70,11 @@
     // ===========================================================================
     %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
     %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
+
+    // Annotate the exported function as already translated.
+    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
+    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
     transform.yield
   }
 } // module
diff --git a/tests/transform_dialect/cuda/reduction.mlir b/tests/transform_dialect/cuda/reduction.mlir
index 2642b03..d506a17 100644
--- a/tests/transform_dialect/cuda/reduction.mlir
+++ b/tests/transform_dialect/cuda/reduction.mlir
@@ -18,24 +18,9 @@
   return %2 : !out_tensor_t
 }
 
-// RUN: iree-opt %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-abi-transformation-pipeline \
-// RUN:     --iree-flow-transformation-pipeline  \
-// RUN:     --iree-stream-transformation-pipeline \
-// RUN:     --iree-hal-configuration-pipeline | \
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
-// RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_codegen_spec.mlir \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
-// RUN: FileCheck %s --check-prefix=CHECK
-
 // RUN: iree-compile %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-opt-const-expr-hoisting=false --iree-opt-const-eval=false \
-/// Constant JIT'ing must be disabled because the transform-dialect debug
-/// flags leak to the JIT session, which doesn't know what to do with them.
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_codegen_spec.mlir \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_codegen_spec.mlir@codegen | \
 // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\
 // RUN: FileCheck %s --check-prefix=EXEC
 
@@ -44,34 +29,5 @@
 // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\
 // RUN: FileCheck %s --check-prefix=EXEC
 
-  //     CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-  //     CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
-  //     CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index
-  //     CHECK-DAG: %[[SHMEM_ALLOC:.*]] = memref.alloc() {alignment = 64 : i64} : memref<1x2xf32, #gpu.address_space<workgroup>>
-  //     CHECK-DAG: %[[TIDX:.]] = gpu.thread_id  x
-  //     CHECK-DAG: %[[TIDY:.]] = gpu.thread_id  y
-  //     CHECK-DAG: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index
-
-  // Distributed reduction: everyone loads then 5 xor + addf expected
-  //         CHECK: vector.transfer_read %{{.*}}[%[[workgroup_id_x]], %[[TIDY]], %[[TIDX]]]
-  // CHECK-COUNT-5: gpu.shuffle  xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf
-
-  //         CHECK: %[[RES:.*]] = arith.addf %{{.*}}
-
-  //         CHECK: %[[RES_VEC:.*]] = vector.broadcast %[[RES]] : f32 to vector<f32>
-  //         CHECK: scf.if %[[CONDXIS0]]
-  //         CHECK:   vector.transfer_write %[[RES_VEC]], %[[SHMEM_ALLOC]][%[[C0]], %[[TIDY]]]
-  //         CHECK: gpu.barrier
-
-  // Last part is not distributed atm and is only run by threadIdx.x == 0 and threadIdx.y == 0.
-  //         CHECK: %[[CONDYIS0:.*]] = arith.cmpi ult, %[[TIDY]], %[[C1]] : index
-  //          TODO: cond eq 0 and cond ult 1 do not CSE atm.
-  //         CHECK: %[[CONXANDYARE0:.*]] = arith.andi %{{.*}}, %[[CONDYIS0]] : i1
-  //         CHECK: scf.if %[[CONXANDYARE0]] {
-  //         CHECK:   vector.transfer_read
-  //         CHECK:   vector.reduction <add>
-  //         CHECK:   vector.transfer_write
-  //         CHECK: gpu.barrier
-
 //      EXEC: result[0]: hal.buffer_view
 // EXEC-NEXT: 8xf32=64 64 64 64 64 64 64 64
diff --git a/tests/transform_dialect/cuda/reduction_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_codegen_spec.mlir
index 364f1a6..c8b46b3 100644
--- a/tests/transform_dialect/cuda/reduction_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/reduction_codegen_spec.mlir
@@ -115,6 +115,11 @@
     transform.iree.apply_licm %func_op_3 : !transform.any_op
     transform.apply_cse to %func_op_3 : !transform.any_op
 
+    // Annotate the exported function as already translated.
+    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
+    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
+
     transform.yield
   }
 } // module
diff --git a/tests/transform_dialect/cuda/reduction_eltwise.mlir b/tests/transform_dialect/cuda/reduction_eltwise.mlir
index a266998..0bd49b5 100644
--- a/tests/transform_dialect/cuda/reduction_eltwise.mlir
+++ b/tests/transform_dialect/cuda/reduction_eltwise.mlir
@@ -29,59 +29,17 @@
   return %7 : !out_tensor_t
 }
 
-// RUN: iree-opt %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-abi-transformation-pipeline \
-// RUN:     --iree-flow-transformation-pipeline  \
-// RUN:     --iree-stream-transformation-pipeline \
-// RUN:     --iree-hal-configuration-pipeline | \
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
-// RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_eltwise_codegen_spec.mlir \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
-// RUN: FileCheck %s --check-prefix=CHECK
-
 // RUN: iree-compile %s --iree-hal-target-backends=cuda \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_eltwise_codegen_spec.mlir \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_eltwise_codegen_spec.mlir@codegen | \
 // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\
 // RUN: FileCheck %s --check-prefix=EXEC
 
-/// Note: the current --iree-codegen-llvmgpu-enable-transform-dialect-jit only works for exactly this reduction atm.
+/// Note: the current --iree-codegen-llvmgpu-enable-transform-dialect-jit
+/// only works for exactly this reduction atm.
 // RUN: iree-compile %s --iree-hal-target-backends=cuda | \
 // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\
 // RUN: FileCheck %s --check-prefix=EXEC
 
-  //     CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-  //     CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
-  //     CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index
-  //     CHECK-DAG: %[[SHMEM_ALLOC:.*]] = memref.alloc() {alignment = 64 : i64} : memref<1x2xf32, #gpu.address_space<workgroup>>
-  //     CHECK-DAG: %[[TIDX:.]] = gpu.thread_id  x
-  //     CHECK-DAG: %[[TIDY:.]] = gpu.thread_id  y
-  //     CHECK-DAG: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index
-
-  // Distributed reduction: everyone loads then 5 xor + addf expected
-  //         CHECK: vector.transfer_read %{{.*}}[%[[workgroup_id_x]], %[[TIDY]], %[[TIDX]]]
-  // CHECK-COUNT-5: gpu.shuffle  xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf
-
-  //         CHECK: %[[RES:.*]] = arith.addf %{{.*}}
-
-  //         CHECK: %[[RES_VEC:.*]] = vector.broadcast %[[RES]] : f32 to vector<f32>
-  //         CHECK: scf.if %[[CONDXIS0]]
-  //         CHECK:   vector.transfer_write %[[RES_VEC]], %[[SHMEM_ALLOC]][%[[C0]], %[[TIDY]]]
-  //         CHECK: gpu.barrier
-
-  // Last part is not distributed atm and is only run by threadIdx.x == 0 and threadIdx.y == 0.
-  // It should contain the fused elementwise operation.
-  //         CHECK: %[[CONDYIS0:.*]] = arith.cmpi ult, %[[TIDY]], %[[C1]] : index
-  //          TODO: cond eq 0 and cond ult 1 do not CSE atm.
-  //         CHECK: %[[CONXANDYARE0:.*]] = arith.andi %{{.*}}, %[[CONDYIS0]] : i1
-  //         CHECK: scf.if %[[CONXANDYARE0]] {
-  //         CHECK:   vector.transfer_read
-  //         CHECK:   vector.reduction <add>
-  //         CHECK:   math.sqrt
-  //         CHECK:   vector.transfer_write
-  //         CHECK: gpu.barrier
-
 //      EXEC: result[0]: hal.buffer_view
 // EXEC-NEXT: 8xf32=8 8 8 8 8 8 8 8
diff --git a/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir
index aad683d..42a5848 100644
--- a/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir
@@ -154,6 +154,11 @@
     transform.iree.apply_licm %func_op_3 : !transform.any_op
     transform.apply_cse to %func_op_3 : !transform.any_op
 
+    // Annotate the exported function as already translated.
+    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
+    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
+
     transform.yield
   }
 } // module
diff --git a/tests/transform_dialect/cuda/reduction_v2.mlir b/tests/transform_dialect/cuda/reduction_v2.mlir
index 6ff6442..6d367d2 100644
--- a/tests/transform_dialect/cuda/reduction_v2.mlir
+++ b/tests/transform_dialect/cuda/reduction_v2.mlir
@@ -18,21 +18,9 @@
   return %2 : !out_tensor_t
 }
 
-// RUN: iree-opt %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-abi-transformation-pipeline \
-// RUN:     --iree-flow-transformation-pipeline  \
-// RUN:     --iree-stream-transformation-pipeline \
-// RUN:     --iree-hal-configuration-pipeline | \
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
-// RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
-// RUN: FileCheck %s --check-prefix=CHECK
-
 // RUN: iree-compile %s --iree-hal-target-backends=cuda \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir@codegen | \
 // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="33x1024xf32=1" |\
 // RUN: FileCheck %s --check-prefix=EXEC
 
@@ -40,39 +28,6 @@
 // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="33x1024xf32=1" |\
 // RUN: FileCheck %s --check-prefix=EXEC
 
-
-  //     CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-  //     CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
-  //     CHECK-DAG: %[[F0:.*]] = arith.constant dense<0.000000e+00> : vector<4xf32>
-  //     CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index
-  //     CHECK-DAG: %[[SHMEM_ALLOC:.*]] = memref.alloc() {alignment = 64 : i64} : memref<1x128xf32, #gpu.address_space<workgroup>>
-
-  //         CHECK: %[[TIDX:.]] = gpu.thread_id  x
-  //         CHECK: %[[IDX_0:.*]] = affine.apply{{.*}}()[%[[TIDX]]]
-  //         CHECK: gpu.barrier
-  // TODO: Properly produce/CSE IDX_1 vs IDX_0
-  //         CHECK: %[[IDX_1:.*]] = affine.apply{{.*}}(%[[TIDX]])
-  // Local per-thread scf.for-based reduction.
-  //         CHECK: scf.for
-  //         CHECK:   vector.transfer_read
-  //         CHECK:   vector.transfer_read %[[SHMEM_ALLOC]][%[[C0]], %[[IDX_1]]]
-  //         CHECK:   arith.addf %{{.*}}, %{{.*}} : vector<4xf32>
-  //         CHECK:   vector.transfer_write %{{.*}}, %[[SHMEM_ALLOC]][%[[C0]], %[[IDX_1]]]
-  // TODO: remove unnecessary barrier within the loop
-  //         CHECK:   gpu.barrier
-
-  // Distributed reduction: everyone loads then 5 xor + addf expected
-  //         CHECK: vector.transfer_read %{{.*}}[%[[C0]], %[[IDX_0]]]
-  // CHECK-COUNT-5: gpu.shuffle  xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf
-
-  //         CHECK: %[[RES:.*]] = arith.addf %{{.*}}
-
-  //         CHECK: %[[RES_VEC:.*]] = vector.broadcast %[[RES]] : f32 to vector<f32>
-  //         CHECK: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index
-  //         CHECK: scf.if %[[CONDXIS0]]
-  //         CHECK:   vector.transfer_write %[[RES_VEC]]
-  //         CHECK: gpu.barrier
-
 // only checking the first 6 of 33
 //      EXEC: result[0]: hal.buffer_view
 // EXEC-NEXT: 33xf32=1024 1024 1024 1024 1024 1024
diff --git a/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir
index 4fd18f0..bb9ecbf 100644
--- a/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir
@@ -103,6 +103,11 @@
     transform.iree.apply_licm %func_7 : !transform.any_op
     transform.apply_cse to %func_7 : !transform.any_op
 
+    // Annotate the exported function as already translated.
+    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
+    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
+
     transform.yield
   }
 } // module
diff --git a/tests/transform_dialect/cuda/reduction_v2_uneven.mlir b/tests/transform_dialect/cuda/reduction_v2_uneven.mlir
index 29b2d48..66b88cd 100644
--- a/tests/transform_dialect/cuda/reduction_v2_uneven.mlir
+++ b/tests/transform_dialect/cuda/reduction_v2_uneven.mlir
@@ -18,52 +18,12 @@
   return %2 : !out_tensor_t
 }
 
-// RUN: iree-opt %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-abi-transformation-pipeline \
-// RUN:     --iree-flow-transformation-pipeline  \
-// RUN:     --iree-stream-transformation-pipeline \
-// RUN:     --iree-hal-configuration-pipeline | \
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
-// RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
-// RUN: FileCheck %s --check-prefix=CHECK
-
 // RUN: iree-compile %s --iree-hal-target-backends=cuda \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir@codegen | \
 // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="33x34567xf32=1" |\
 // RUN: FileCheck %s --check-prefix=EXEC
 
-  //     CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-  //     CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
-  //     CHECK-DAG: %[[F0:.*]] = arith.constant dense<0.000000e+00> : vector<4xf32>
-  //     CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index
-  //     CHECK-DAG: %[[SHMEM_ALLOC:.*]] = memref.alloc() {alignment = 64 : i64} : memref<1x128xf32, #gpu.address_space<workgroup>>
-
-  //         CHECK: %[[TIDX:.]] = gpu.thread_id  x
-  //         CHECK: %[[IDX:.*]] = affine.apply{{.*}}%[[TIDX]]
-  //         CHECK: gpu.barrier
-  // Local per-thread scf.for-based reduction.
-  //         CHECK: scf.for
-  //     CHECK-NOT:   memref.alloc
-  //         CHECK:   linalg.generic
-  // TODO: remove unnecessary barrier within the loop
-  //         CHECK:   gpu.barrier
-
-  // Distributed reduction: everyone loads then 5 xor + addf expected
-  //         CHECK: vector.transfer_read %{{.*}}[%[[C0]], %[[IDX]]]
-  // CHECK-COUNT-5: gpu.shuffle  xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf
-
-  //         CHECK: %[[RES:.*]] = arith.addf %{{.*}}
-
-  //         CHECK: %[[RES_VEC:.*]] = vector.broadcast %[[RES]] : f32 to vector<f32>
-  //         CHECK: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index
-  //         CHECK: scf.if %[[CONDXIS0]]
-  //         CHECK:   vector.transfer_write %[[RES_VEC]]
-  //         CHECK: gpu.barrier
-
 // only checking the first 6 of 33
 //      EXEC: result[0]: hal.buffer_view
 // EXEC-NEXT: 33xf32=34567 34567 34567 34567 34567 34567
diff --git a/tests/transform_dialect/cuda/softmax.mlir b/tests/transform_dialect/cuda/softmax.mlir
index 27464db..91bbc07 100644
--- a/tests/transform_dialect/cuda/softmax.mlir
+++ b/tests/transform_dialect/cuda/softmax.mlir
@@ -1,24 +1,7 @@
-
-// RUN: iree-opt %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-abi-transformation-pipeline \
-// RUN:     --iree-flow-transformation-pipeline  \
-// RUN:     --iree-flow-dispatch-use-transform-dialect=%p/softmax_dispatch_spec.mlir \
-// RUN:     --iree-stream-transformation-pipeline \
-// RUN:     --iree-hal-configuration-pipeline | \
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
-// RUN:     --iree-codegen-transform-dialect-library=%p/softmax_codegen_spec.mlir \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen \
-// RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false | \
-// RUN: FileCheck %s --check-prefix=CHECK-SHUFFLE
-
-/// Constant JIT'ing must be disabled because the transform-dialect debug
-/// flags leak to the JIT session, which doesn't know what to do with them.
 // RUN: iree-compile %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-opt-const-expr-hoisting=false --iree-opt-const-eval=false \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
 // RUN:     --iree-flow-dispatch-use-transform-dialect=%p/softmax_dispatch_spec.mlir \
-// RUN:     --iree-codegen-transform-dialect-library=%p/softmax_codegen_spec.mlir \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/softmax_codegen_spec.mlir@codegen | \
 // RUN: iree-run-module --module=- --function=softmax --device=cuda | \
 // RUN: FileCheck %s
 
@@ -27,12 +10,6 @@
 !in_tensor_t = tensor<16x128x128xf32>
 !out_tensor_t = tensor<16x128x128xf32>
 
-// Compilation checks that shuffles are produced.
-// CHECK-SHUFFLE: vector.reduction <maximumf>
-// CHECK-SHUFFLE-COUNT-5: gpu.shuffle  xor
-// CHECK-SHUFFLE: vector.reduction <add>
-// CHECK-SHUFFLE-COUNT-5: gpu.shuffle  xor
-
 // Execution only checks that @softmax runs.
 //      CHECK: EXEC @softmax
 //      CHECK: 16x128x128xf32=[
diff --git a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_codegen_spec.mlir
index 95a0374..86f4909 100644
--- a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/softmax_codegen_spec.mlir
@@ -109,6 +109,11 @@
     %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op
     transform.iree.vector.warp_distribute %end_func : (!transform.any_op) -> ()
 
+    // Annotate the exported function as already translated.
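+    // Setting `translation_info` to `None` on every export in the variant signals
+    // that the codegen driver should not apply any further lowering pipeline.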
+    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
+    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
+
     transform.yield
   }
 } // module
diff --git a/tests/transform_dialect/cuda/softmax_partial.mlir b/tests/transform_dialect/cuda/softmax_partial.mlir
index 91032cb..018ad8c 100644
--- a/tests/transform_dialect/cuda/softmax_partial.mlir
+++ b/tests/transform_dialect/cuda/softmax_partial.mlir
@@ -1,31 +1,12 @@
-
-// RUN: iree-opt %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-abi-transformation-pipeline \
-// RUN:     --iree-flow-transformation-pipeline  \
-// RUN:     --iree-stream-transformation-pipeline \
-// RUN:     --iree-hal-configuration-pipeline | \
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
-// RUN:     --iree-codegen-transform-dialect-library=%p/softmax_partial_codegen_spec.mlir \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen \
-// RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false | \
-// RUN: FileCheck %s --check-prefix=CHECK-SHUFFLE
-
 // RUN: iree-compile %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-opt-const-expr-hoisting=false --iree-opt-const-eval=false \
-/// Constant JIT'ing must be disabled because the transform-dialect debug
-/// flags leak to the JIT session, which doesn't know what to do with them.
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-transform-dialect-library=%p/softmax_partial_codegen_spec.mlir \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/softmax_partial_codegen_spec.mlir@codegen | \
 // RUN: iree-run-module --module=- --function=softmax_partial --device=cuda | \
 // RUN: FileCheck %s
 
 !tmp_tensor_t = tensor<16x128xf32>
 !out_tensor_t = tensor<16x128x128xf32>
 
-// Compilation checks that shuffles are produced.
-// CHECK-SHUFFLE: gpu.shuffle  xor
-
 // Execution only checks that @softmax_partial runs.
 //      CHECK: EXEC @softmax_partial
 //      CHECK: 16x128x128xf32=[
diff --git a/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir
index d62558f..65ea847 100644
--- a/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir
@@ -92,6 +92,11 @@
       : (!transform.any_op) -> !transform.any_op
     transform.iree.vector.warp_distribute %end_func : (!transform.any_op) -> ()
 
+    // Annotate the exported function as already translated.
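+    // Setting `translation_info` to `None` on every export in the variant signals
+    // that the codegen driver should not apply any further lowering pipeline.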
+    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
+    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
+
     transform.yield
   }
 } // module
diff --git a/tests/transform_dialect/cuda/softmax_v2.mlir b/tests/transform_dialect/cuda/softmax_v2.mlir
index 07e3c28..5ef3a90 100644
--- a/tests/transform_dialect/cuda/softmax_v2.mlir
+++ b/tests/transform_dialect/cuda/softmax_v2.mlir
@@ -1,23 +1,7 @@
-// RUN: iree-opt %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-abi-transformation-pipeline \
-// RUN:     --iree-flow-transformation-pipeline  \
-// RUN:     --iree-flow-fuse-multi-use \
-// RUN:     --iree-stream-transformation-pipeline \
-// RUN:     --iree-hal-configuration-pipeline | \
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
-// RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-transform-dialect-library=%p/softmax_v2_codegen_spec.mlir \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
-// RUN: FileCheck %s --check-prefix=CHECK-SHUFFLE
-
 // RUN: iree-compile %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-opt-const-expr-hoisting=false --iree-opt-const-eval=false \
-/// Constant JIT'ing must be disabled because the transform-dialect debug
-/// flags leak to the JIT session, which doesn't know what to do with them.
 // RUN:     --iree-flow-fuse-multi-use \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-transform-dialect-library=%p/softmax_v2_codegen_spec.mlir \
-// RUN:     --iree-codegen-use-transform-dialect-strategy=codegen | \
+// RUN:     --iree-codegen-transform-dialect-library=%p/softmax_v2_codegen_spec.mlir@codegen | \
 // RUN: iree-run-module --module=- --function=softmax --device=cuda | \
 // RUN: FileCheck %s
 
@@ -25,9 +9,6 @@
 !in_tensor_t = tensor<16x128x128xf32>
 !out_tensor_t = tensor<16x128x128xf32>
 
-// Compilation checks that shuffles are produced.
-// CHECK-SHUFFLE: gpu.shuffle  xor
-
 // Execution only checks that @softmax runs.
 //      CHECK: EXEC @softmax
 //      CHECK: 16x128x128xf32=[
diff --git a/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir
index 773c938..c73cbe9 100644
--- a/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir
@@ -138,6 +138,11 @@
     transform.iree.apply_licm %func_op_3 : !transform.any_op
     transform.apply_cse to %func_op_3 : !transform.any_op
 
+    // Annotate the exported function as already translated.
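+    // Setting `translation_info` to `None` on every export in the variant signals
+    // that the codegen driver should not apply any further lowering pipeline.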
+    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
+    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
+
     transform.yield
   }
 } // module