[Codegen] Re-Enable transform dialect configuration strategy round 2 (#16427)

This time it drops all transform dialect usage except for the transform
library file path + entry point name. This reduces code complexity in
`MaterializeUserConfigs`.

It also cleans up some of the transform dialect tests and stops lit testing
them at the same time. We might want to consider dropping some of them, as
they aren't being maintained (the only thing they currently verify is that
the transform scripts are valid for CUDA).
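
For reference, a minimal sketch of the new usage (the input and library paths here are illustrative; `@kernel_config` names the entry point defined in the sample library below):

```shell
iree-compile input.mlir \
  --iree-hal-target-backends=vulkan \
  --iree-codegen-transform-dialect-library=path/to/transform_library.mlir@kernel_config
```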
diff --git a/samples/transform_dialect/example_module.mlir b/samples/transform_dialect/example_module.mlir
index c5eab46..1e4ac4e 100644
--- a/samples/transform_dialect/example_module.mlir
+++ b/samples/transform_dialect/example_module.mlir
@@ -107,28 +107,29 @@
 }
 
 /// We test first with threading off so that the printers are legible.
-// R-UN: iree-compile %s --iree-hal-target-backends=vulkan \
-// R-UN:   --iree-codegen-use-transform-dialect-strategy=transform_main \
-// R-UN:   --iree-codegen-transform-dialect-library=%p/transform_library.mlir \
-// R-UN:   --compile-from=executable-sources \
-// R-UN:   --compile-to=executable-targets \
-// R-UN:   --mlir-disable-threading | \
-// R-UN: FileCheck %s --check-prefixes=CODEGEN-PRINTER
+// RUN: iree-compile %s --iree-hal-target-backends=vulkan \
+// RUN:   --iree-codegen-transform-dialect-library=%p/transform_library.mlir@kernel_config \
+// RUN:   --compile-from=executable-sources \
+// RUN:   --compile-to=executable-targets \
+// RUN:   --mlir-disable-threading | \
+// RUN: FileCheck %s --check-prefixes=CODEGEN-PRINTER
 
-// CODEGEN-PRINTER:     IR printer: Setting matmul strategy to default top-level
-// CODEGEN-PRINTER:       translation_info = #iree_codegen.translation_info<TransformDialectCodegen codegen_spec = @transform_main
+// CODEGEN-PRINTER:     IR printer: Setting matmul strategy to custom_transform_strategy
+// CODEGEN-PRINTER:       translation_info = #iree_codegen.translation_info<TransformDialectCodegen codegen_spec = @custom_transform_strategy>
 // CODEGEN-PRINTER:     IR printer: Setting reduce strategy to base vectorize top-level
 // CODEGEN-PRINTER:       translation_info = #iree_codegen.translation_info<SPIRVBaseVectorize>, workgroup_size = [16 : index, 1 : index, 1 : index]
 
 /// Then test with threading to make sure it runs
 // RUN: iree-compile %s --iree-hal-target-backends=vulkan \
-// RUN:   --iree-codegen-use-transform-dialect-strategy=@transform_main \
-// RUN:   --iree-codegen-transform-dialect-library=%p/transform_library.mlir \
+// RUN:   --iree-codegen-transform-dialect-library=%p/transform_library.mlir@kernel_config \
 // RUN:   --compile-from=executable-sources \
 // RUN:   --compile-to=executable-targets \
 // RUN:   --mlir-disable-threading | \
 // RUN: FileCheck %s --check-prefixes=CODEGEN
 
+// CODEGEN: Ran custom_transform_strategy
 // CODEGEN: spirv.func @example_module_dispatch_0_generic_80_f32
-// CODEGEN: spirv.func @example_module_dispatch_1_matmul_16x16x5_f32
+// CODEGEN: hal.executable private @example_module_dispatch_1
+// CODEGEN:   #iree_codegen.translation_info<TransformDialectCodegen codegen_spec = @custom_transform_strategy>
+// CODEGEN:     spirv.func @example_module_dispatch_1_matmul_16x16x5_f32
 // CODEGEN: spirv.func @example_module_dispatch_2_generic_16x16_f32
diff --git a/samples/transform_dialect/transform_library.mlir b/samples/transform_dialect/transform_library.mlir
index 3bb75ad..8b17af7 100644
--- a/samples/transform_dialect/transform_library.mlir
+++ b/samples/transform_dialect/transform_library.mlir
@@ -1,13 +1,76 @@
 module attributes { transform.with_named_sequence } {
-  // Print and send it down normal IREE codegen.
-  transform.named_sequence @custom_matmul(%matmul: !transform.any_op {transform.consumed}) {  
-    %1 = transform.structured.generalize %matmul : (!transform.any_op) -> !transform.any_op
-    transform.print {name = "Setting matmul strategy to default"}
+  // Example of a custom matmul strategy. The target matmul is annotated with
+  // the name of this strategy down below before strategy selection, overriding
+  // default IREE codegen.
+  transform.named_sequence @custom_transform_strategy(
+      %variant_op: !transform.any_op {transform.consumed}) {
+    // Step 1. Re-match the matmul
+    // ===========================================================================
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+
+    // Step 2. Tile to grid
+    // ===========================================================================
+    %grid_reduction, %forall_grid =
+    transform.structured.tile_using_forall %matmul tile_sizes [16, 16] ( mapping = [#gpu.block<x>, #gpu.block<y>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
+
+    // Step 3. Vectorize
+    // ===========================================================================
+    %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+    } : !transform.any_op
+    %func_1 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
+
+    // Step 4. Bufferize
+    // ===========================================================================
+    transform.apply_patterns to %func_1 {
+      transform.apply_patterns.iree.fold_fill_into_pad
+      transform.apply_patterns.linalg.tiling_canonicalization
+      transform.apply_patterns.scf.for_loop_canonicalization
+    } : !transform.any_op
+    transform.apply_patterns to %func_1 {
+      transform.apply_patterns.tensor.reassociative_reshape_folding
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.apply_cse to %func_1 : !transform.any_op
+    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+    transform.apply_patterns to %func_1 {
+      transform.apply_patterns.linalg.erase_unnecessary_inputs
+    } : !transform.any_op
+    %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
+    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+
+    // Step 5. Post-bufferization vector distribution
+    // ===========================================================================
+    %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
+    transform.iree.map_nested_forall_to_gpu_threads %func_7
+        workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
+
+    // Step 6. Do layout analysis and lower to mma
+    // ===========================================================================
+    %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
+    transform.print {name = "Ran custom_transform_strategy"}
     transform.yield
   }
 
-  // Send it down subgroup reduce.
-  transform.named_sequence @use_subgroup_reduce(%reduce: !transform.any_op {transform.readonly}) {  
+  // Send it down a custom transform dialect pipeline.
+  transform.named_sequence @custom_matmul(%matmul: !transform.any_op {transform.readonly}) {
+    %variant_op = transform.get_parent_op %matmul {op_name = "hal.executable.variant"} : (!transform.any_op) -> !transform.any_op
+    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    %subgroup_reduce = transform.param.constant #iree_codegen.translation_info<TransformDialectCodegen
+                                                                               codegen_spec = @custom_transform_strategy> -> !transform.any_param
+    transform.annotate %exports "translation_info" = %subgroup_reduce : !transform.any_op, !transform.any_param
+    transform.print {name = "Setting matmul strategy to custom_transform_strategy"}
+    transform.yield
+  }
+
+  // Send it down subgroup reduce with a custom tiling configuration.
+  transform.named_sequence @use_base_vectorize(%reduce: !transform.any_op {transform.readonly}) {
     %variant_op = transform.get_parent_op %reduce {op_name = "hal.executable.variant"} : (!transform.any_op) -> !transform.any_op
     %lowering_config = transform.param.constant #iree_codegen.lowering_config<tile_sizes = [[8, 0], [1, 0], [0, 0, 4]]> -> !transform.any_param
     transform.annotate %reduce "lowering_config" = %lowering_config : !transform.any_op, !transform.any_param
@@ -42,10 +105,34 @@
     transform.yield %matched : !transform.any_op
   }
 
-  transform.named_sequence @transform_main(%variant_op: !transform.any_op {transform.consumed}) {  
+  // An example of a custom transform dialect based kernel config. Note that
+  // because of the way `transform.foreach_match` works, the callback cannot
+  // manipulate IR beyond the op *given* to the matcher, as foreach_match will
+  // attempt to keep walking the IR even after a successful match. The expected
+  // flow for a strategy like this is as follows:
+  //
+  // Author an entry point like this (@kernel_config) that walks the IR and
+  // attempts to annotate the dispatch with the codegen strategy to use, i.e.
+  //   transform.foreach_match in %variant_op
+  //       @matcher_0 -> @annotator_0,
+  //       @matcher_1 -> @annotator_1,
+  //       ...
+  //
+  // the annotators should attach an #iree_codegen.translation_info attribute
+  // to the `hal.executable.export` ops within the variant as well as any
+  // relevant op specific tile sizes (and other important attributes like
+  // workgroup_size and subgroup_size, if relevant). This will then get handed
+  // off to backend specific kernel config, which will let these user configs
+  // pass through unperturbed.
+  //
+  // To couple this with a transform dialect based codegen strategy, the target
+  // codegen strategy can be included inline with this library and relevant ops
+  // can be annotated with `TransformDialectCodegen` as the lowering pipeline,
+  // with a reference to the strategy to use (see an example above).
+  transform.named_sequence @kernel_config(%variant_op: !transform.any_op {transform.consumed}) {
     transform.foreach_match in %variant_op
         @match_matmul -> @custom_matmul,
-        @match_reduce -> @use_subgroup_reduce
+        @match_reduce -> @use_base_vectorize
       : (!transform.any_op) -> (!transform.any_op)
     transform.yield
   }