Integrate LLVM at llvm/llvm-project@7d3a181c (#12047)

Co-authored-by: Diego Caballero <diegocaballero@google.com>
diff --git a/compiler/src/iree/compiler/API2/api_exports.c b/compiler/src/iree/compiler/API2/api_exports.c
index 21ad67f..7fd6604 100644
--- a/compiler/src/iree/compiler/API2/api_exports.c
+++ b/compiler/src/iree/compiler/API2/api_exports.c
@@ -144,6 +144,7 @@
 extern void mlirAttributeIsAFloat();
 extern void mlirAttributeIsAInteger();
 extern void mlirAttributeIsAIntegerSet();
+extern void mlirAttributeIsALocation();
 extern void mlirAttributeIsAOpaque();
 extern void mlirAttributeIsASparseElements();
 extern void mlirAttributeIsAStridedLayout();
@@ -355,7 +356,9 @@
 extern void mlirLocationCallSiteGet();
 extern void mlirLocationEqual();
 extern void mlirLocationFileLineColGet();
+extern void mlirLocationFromAttribute();
 extern void mlirLocationFusedGet();
+extern void mlirLocationGetAttribute();
 extern void mlirLocationGetContext();
 extern void mlirLocationNameGet();
 extern void mlirLocationPrint();
@@ -719,6 +722,7 @@
   x += (uintptr_t)&mlirAttributeIsAFloat;
   x += (uintptr_t)&mlirAttributeIsAInteger;
   x += (uintptr_t)&mlirAttributeIsAIntegerSet;
+  x += (uintptr_t)&mlirAttributeIsALocation;
   x += (uintptr_t)&mlirAttributeIsAOpaque;
   x += (uintptr_t)&mlirAttributeIsASparseElements;
   x += (uintptr_t)&mlirAttributeIsAStridedLayout;
@@ -930,7 +934,9 @@
   x += (uintptr_t)&mlirLocationCallSiteGet;
   x += (uintptr_t)&mlirLocationEqual;
   x += (uintptr_t)&mlirLocationFileLineColGet;
+  x += (uintptr_t)&mlirLocationFromAttribute;
   x += (uintptr_t)&mlirLocationFusedGet;
+  x += (uintptr_t)&mlirLocationGetAttribute;
   x += (uintptr_t)&mlirLocationGetContext;
   x += (uintptr_t)&mlirLocationNameGet;
   x += (uintptr_t)&mlirLocationPrint;
diff --git a/compiler/src/iree/compiler/API2/api_exports.def b/compiler/src/iree/compiler/API2/api_exports.def
index 4180261..e3316c9 100644
--- a/compiler/src/iree/compiler/API2/api_exports.def
+++ b/compiler/src/iree/compiler/API2/api_exports.def
@@ -136,6 +136,7 @@
   mlirAttributeIsAFloat
   mlirAttributeIsAInteger
   mlirAttributeIsAIntegerSet
+  mlirAttributeIsALocation
   mlirAttributeIsAOpaque
   mlirAttributeIsASparseElements
   mlirAttributeIsAStridedLayout
@@ -347,7 +348,9 @@
   mlirLocationCallSiteGet
   mlirLocationEqual
   mlirLocationFileLineColGet
+  mlirLocationFromAttribute
   mlirLocationFusedGet
+  mlirLocationGetAttribute
   mlirLocationGetContext
   mlirLocationNameGet
   mlirLocationPrint
diff --git a/compiler/src/iree/compiler/API2/api_exports.ld b/compiler/src/iree/compiler/API2/api_exports.ld
index 1629cea..435d025 100644
--- a/compiler/src/iree/compiler/API2/api_exports.ld
+++ b/compiler/src/iree/compiler/API2/api_exports.ld
@@ -137,6 +137,7 @@
     mlirAttributeIsAFloat;
     mlirAttributeIsAInteger;
     mlirAttributeIsAIntegerSet;
+    mlirAttributeIsALocation;
     mlirAttributeIsAOpaque;
     mlirAttributeIsASparseElements;
     mlirAttributeIsAStridedLayout;
@@ -348,7 +349,9 @@
     mlirLocationCallSiteGet;
     mlirLocationEqual;
     mlirLocationFileLineColGet;
+    mlirLocationFromAttribute;
     mlirLocationFusedGet;
+    mlirLocationGetAttribute;
     mlirLocationGetContext;
     mlirLocationNameGet;
     mlirLocationPrint;
diff --git a/compiler/src/iree/compiler/API2/api_exports.macos.lst b/compiler/src/iree/compiler/API2/api_exports.macos.lst
index 4d05006..b9786ad 100644
--- a/compiler/src/iree/compiler/API2/api_exports.macos.lst
+++ b/compiler/src/iree/compiler/API2/api_exports.macos.lst
@@ -135,6 +135,7 @@
 _mlirAttributeIsAFloat
 _mlirAttributeIsAInteger
 _mlirAttributeIsAIntegerSet
+_mlirAttributeIsALocation
 _mlirAttributeIsAOpaque
 _mlirAttributeIsASparseElements
 _mlirAttributeIsAStridedLayout
@@ -346,7 +347,9 @@
 _mlirLocationCallSiteGet
 _mlirLocationEqual
 _mlirLocationFileLineColGet
+_mlirLocationFromAttribute
 _mlirLocationFusedGet
+_mlirLocationGetAttribute
 _mlirLocationGetContext
 _mlirLocationNameGet
 _mlirLocationPrint
diff --git a/compiler/src/iree/compiler/Codegen/Common/FlattenMemRefSubspanPass.cpp b/compiler/src/iree/compiler/Codegen/Common/FlattenMemRefSubspanPass.cpp
index 14f7d9b..8e39dd0 100644
--- a/compiler/src/iree/compiler/Codegen/Common/FlattenMemRefSubspanPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/FlattenMemRefSubspanPass.cpp
@@ -849,7 +849,7 @@
     // Fold subviews if any new opportunity has been created.
     RewritePatternSet foldSubviewPatterns(context);
     memref::populateFoldMemRefAliasOpPatterns(foldSubviewPatterns);
-    if (failed(applyPatternsAndFoldGreedily(getOperation()->getRegions(),
+    if (failed(applyPatternsAndFoldGreedily(getOperation(),
                                             std::move(foldSubviewPatterns)))) {
       return signalPassFailure();
     }
diff --git a/compiler/src/iree/compiler/Codegen/Common/RematerializeParallelOps.cpp b/compiler/src/iree/compiler/Codegen/Common/RematerializeParallelOps.cpp
index d140df4..5601a02 100644
--- a/compiler/src/iree/compiler/Codegen/Common/RematerializeParallelOps.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/RematerializeParallelOps.cpp
@@ -28,15 +28,15 @@
     for (OpOperand& opOperand : genericOp->getOpOperands()) {
       if (!linalg::areElementwiseOpsFusable(&opOperand)) continue;
 
-      FailureOr<Operation*> fusedOp =
+      FailureOr<linalg::ElementwiseOpFusionResult> fusedOp =
           linalg::fuseElementwiseOps(rewriter, &opOperand);
       if (succeeded(fusedOp)) {
         // Forward lowering config.
         if (auto loweringAttr = getLoweringConfig(genericOp)) {
-          setLoweringConfig(fusedOp.value(), loweringAttr);
+          setLoweringConfig(fusedOp.value().fusedOp, loweringAttr);
         }
-        auto replacements =
-            fusedOp.value()->getResults().take_back(genericOp.getNumResults());
+        auto replacements = fusedOp.value().fusedOp->getResults().take_back(
+            genericOp.getNumResults());
         rewriter.replaceOp(genericOp, replacements);
         return success();
       }
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/affinemin_canonicalization.mlir b/compiler/src/iree/compiler/Codegen/Common/test/affinemin_canonicalization.mlir
index 61fa94e..63ecadb 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/affinemin_canonicalization.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/affinemin_canonicalization.mlir
@@ -8,12 +8,11 @@
   %0 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%id1]
   %1 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%count1]
 
+  //  CHECK-DAG:   %[[C32:.*]] = arith.constant 32 : index
+  //  CHECK-DAG:   %[[C4:.*]] = arith.constant 4 : i64
   //      CHECK: scf.for
-  //      CHECK:   %[[C32:.*]] = arith.constant 32 : index
   //      CHECK:   scf.for %{{.*}} = %{{.*}} to %[[C32]]
-  // CHECK-NEXT:     %[[C4:.*]] = arith.constant 4 : index
-  // CHECK-NEXT:     %[[C4I64:.*]] = arith.index_cast %[[C4:.*]]
-  // CHECK-NEXT:     memref.store %[[C4I64]], %{{.*}}[] : memref<i64>
+  // CHECK-NEXT:     memref.store %[[C4]], %{{.*}}[] : memref<i64>
   scf.for %arg0 = %0 to %c1024 step %1 {
     %2 = affine.min affine_map<(d0) -> (32, -d0 + 1024)>(%arg0)
     %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%id2]
@@ -31,9 +30,7 @@
   //      CHECK: scf.for
   //      CHECK:   %[[MIN:.*]] = affine.min
   //      CHECK:   scf.for %{{.*}} = %{{.*}} to %[[MIN]]
-  // CHECK-NEXT:     %[[C4:.*]] = arith.constant 4 : index
-  // CHECK-NEXT:     %[[C4I64:.*]] = arith.index_cast %[[C4:.*]]
-  // CHECK-NEXT:     memref.store %[[C4I64]], %{{.*}}[] : memref<i64>
+  // CHECK-NEXT:     memref.store %[[C4]], %{{.*}}[] : memref<i64>
   scf.for %arg0 = %0 to %c1020 step %1 {
     %2 = affine.min affine_map<(d0) -> (32, -d0 + 1020)>(%arg0)
     %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%id2]
@@ -49,9 +46,7 @@
   //      CHECK: scf.for
   //      CHECK:   %[[MIN:.*]] = affine.min
   //      CHECK:   scf.parallel {{.*}} to (%[[MIN]])
-  // CHECK-NEXT:     %[[C4:.*]] = arith.constant 4 : index
-  // CHECK-NEXT:     %[[C4I64:.*]] = arith.index_cast %[[C4:.*]]
-  // CHECK-NEXT:     memref.store %[[C4I64]], %{{.*}}[] : memref<i64>
+  // CHECK-NEXT:     memref.store %[[C4]], %{{.*}}[] : memref<i64>
   scf.for %arg0 = %0 to %c1020 step %1 {
     %2 = affine.min affine_map<(d0) -> (32, -d0 + 1020)>(%arg0)
     %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%id2]
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/reductions_codegen_spec.mlir b/compiler/src/iree/compiler/Codegen/Common/test/reductions_codegen_spec.mlir
index 475b2d2..b63d491 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/reductions_codegen_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/reductions_codegen_spec.mlir
@@ -17,7 +17,7 @@
   %grid_loop, %outer_tiled = transform.structured.tile_to_foreach_thread_op %fusion_root_1 tile_sizes [1]
     ( mapping = [#gpu.block<x>] )
   
-  %func = transform.structured.match ops{["func.func"]} in %arg0
+  %func = transform.structured.match ops{["func.func"]} in %arg0 : (!pdl.operation) -> !pdl.operation
   %func_1 = transform.iree.apply_patterns %func { bubble_collapse_expand }
 
   // Excessively eager canonicalization results in `fill`s being "fused" due to
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/transform_buffer_opt.mlir b/compiler/src/iree/compiler/Codegen/Common/test/transform_buffer_opt.mlir
index 7b8b3b3..ef59dd5 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/transform_buffer_opt.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/transform_buffer_opt.mlir
@@ -19,7 +19,7 @@
 ^bb0(%arg0: !pdl.operation):
   transform.sequence %arg0 : !pdl.operation failures(propagate) {
   ^bb1(%arg1: !pdl.operation):
-    %0 = transform.structured.match ops{["func.func"]} in %arg1
+    %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!pdl.operation) -> !pdl.operation
     %1 = transform.iree.apply_buffer_optimizations %0
   }
 }
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/transform_dialect_apply_pattern_op.mlir b/compiler/src/iree/compiler/Codegen/Common/test/transform_dialect_apply_pattern_op.mlir
index eb27f03..eda27bb 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/transform_dialect_apply_pattern_op.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/transform_dialect_apply_pattern_op.mlir
@@ -12,7 +12,7 @@
 ^bb0(%arg0: !pdl.operation):
   transform.sequence %arg0 : !pdl.operation failures(propagate) {
   ^bb1(%arg1: !pdl.operation):
-    %0 = transform.structured.match ops{["func.func"]} in %arg1
+    %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!pdl.operation) -> !pdl.operation
     transform.iree.apply_patterns %0 { canonicalization }
   }
 }
@@ -54,7 +54,7 @@
 
 transform.sequence failures(propagate) {
 ^bb1(%arg1: !pdl.operation):
-  %0 = transform.structured.match ops{["scf.foreach_thread"]} in %arg1
+  %0 = transform.structured.match ops{["scf.foreach_thread"]} in %arg1 : (!pdl.operation) -> !pdl.operation
   %1 = transform.cast %0 : !pdl.operation to !transform.op<"scf.foreach_thread">
   transform.iree.share_foreach_thread_operands %1 share_operands = [0] : (!transform.op<"scf.foreach_thread">) -> !transform.op<"scf.foreach_thread">
 }
@@ -86,6 +86,6 @@
 
 transform.sequence failures(propagate) {
 ^bb1(%arg1: !pdl.operation):
-  %0 = transform.structured.match ops{["func.func"]} in %arg1
+  %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!pdl.operation) -> !pdl.operation
   transform.iree.apply_patterns %0 { bubble_collapse_expand }
 }
diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/BufferizationInterfaces.cpp b/compiler/src/iree/compiler/Codegen/Interfaces/BufferizationInterfaces.cpp
index 53d284c..c9d7060 100644
--- a/compiler/src/iree/compiler/Codegen/Interfaces/BufferizationInterfaces.cpp
+++ b/compiler/src/iree/compiler/Codegen/Interfaces/BufferizationInterfaces.cpp
@@ -172,8 +172,8 @@
     return false;
   }
 
-  SmallVector<OpResult> getAliasingOpResult(Operation *op, OpOperand &opOperand,
-                                            const AnalysisState &state) const {
+  bufferization::AliasingOpResultList getAliasingOpResults(
+      Operation *op, OpOperand &opOperand, const AnalysisState &state) const {
     return {};
   }
 
@@ -264,7 +264,7 @@
   SmallVector<Value> newOutputBuffers;
   for (OpResult opResult : op->getOpResults()) {
     SmallVector<OpOperand *> aliasingOpOperands =
-        analysisState.getAliasingOpOperand(opResult);
+        analysisState.getAliasingOpOperands(opResult);
     assert(aliasingOpOperands.size() == 1 && "expected 1 OpOperand");
     FailureOr<Value> resultBuffer =
         getBuffer(rewriter, aliasingOpOperands.front()->get(), options);
@@ -320,10 +320,10 @@
                                const AnalysisState &state) const {
     // Operand is written to if it has an aliasing OpResult.
     auto bufferizableOp = cast<BufferizableOpInterface>(op);
-    return !bufferizableOp.getAliasingOpResult(opOperand, state).empty();
+    return !bufferizableOp.getAliasingOpResults(opOperand, state).empty();
   }
 
-  SmallVector<OpOperand *> getAliasingOpOperand(
+  bufferization::AliasingOpOperandList getAliasingOpOperands(
       Operation *op, OpResult opResult, const AnalysisState &state) const {
     auto linalgExtOp = cast<IREE::LinalgExt::LinalgExtOp>(op);
 
@@ -331,8 +331,8 @@
     return {linalgExtOp.getOutputOperand(opResult.getResultNumber())};
   }
 
-  SmallVector<OpResult> getAliasingOpResult(Operation *op, OpOperand &opOperand,
-                                            const AnalysisState &state) const {
+  bufferization::AliasingOpResultList getAliasingOpResults(
+      Operation *op, OpOperand &opOperand, const AnalysisState &state) const {
     auto dspOp = cast<DestinationStyleOpInterface>(op);
 
     // The i-th "out" tensor may alias with the i-th OpResult.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
index bfdecd1..0cbac1e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
@@ -473,7 +473,7 @@
   populateComplexToLLVMConversionPatterns(typeConverter, patterns);
   populateMathToLLVMConversionPatterns(typeConverter, patterns);
   memref::populateExpandStridedMetadataPatterns(patterns);
-  populateMemRefToLLVMConversionPatterns(typeConverter, patterns);
+  populateFinalizeMemRefToLLVMConversionPatterns(typeConverter, patterns);
   populateFuncToLLVMConversionPatterns(typeConverter, patterns);
   arith::populateArithToLLVMConversionPatterns(typeConverter, patterns);
   populateVectorToSCFConversionPatterns(patterns);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir
index b94a17f..bf02a3a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir
@@ -37,6 +37,6 @@
 ^bb1(%variant_op: !pdl.operation):
   %variant_op_2 = transform.iree.eliminate_empty_tensors %variant_op
   %variant_op_3 = transform.iree.bufferize %variant_op_2
-  %func = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   transform.iree.erase_hal_descriptor_type_from_memref %func
 }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp
index 0606edf..4bd1f00 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp
@@ -112,7 +112,7 @@
       populateLLVMConversionPatterns(&getContext(), llvmPatterns, converter);
       populateMathToLLVMConversionPatterns(converter, llvmPatterns);
       memref::populateExpandStridedMetadataPatterns(llvmPatterns);
-      populateMemRefToLLVMConversionPatterns(converter, llvmPatterns);
+      populateFinalizeMemRefToLLVMConversionPatterns(converter, llvmPatterns);
       populateFuncToLLVMConversionPatterns(converter, llvmPatterns);
       cf::populateControlFlowToLLVMConversionPatterns(converter, llvmPatterns);
       arith::populateArithToLLVMConversionPatterns(converter, llvmPatterns);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
index d2c8f77..1e393a4 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
@@ -83,7 +83,7 @@
       populateLLVMConversionPatterns(&getContext(), llvmPatterns, converter);
       populateMathToLLVMConversionPatterns(converter, llvmPatterns);
       memref::populateExpandStridedMetadataPatterns(llvmPatterns);
-      populateMemRefToLLVMConversionPatterns(converter, llvmPatterns);
+      populateFinalizeMemRefToLLVMConversionPatterns(converter, llvmPatterns);
       populateFuncToLLVMConversionPatterns(converter, llvmPatterns);
       cf::populateControlFlowToLLVMConversionPatterns(converter, llvmPatterns);
       arith::populateArithToLLVMConversionPatterns(converter, llvmPatterns);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir
index e1353af..dea32e2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir
@@ -31,7 +31,7 @@
   ^bb1(%variant_op: !pdl.operation):
     %variant_op_2 = transform.iree.eliminate_empty_tensors %variant_op
     %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op_2
-    %func = transform.structured.match ops{["func.func"]} in %variant_op_3
+    %func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
     transform.iree.erase_hal_descriptor_type_from_memref %func
   }
 }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir
index abbb542..e4d0557 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir
@@ -2,6 +2,6 @@
 ^bb1(%variant_op: !pdl.operation):
   %variant_op_2 = transform.iree.eliminate_empty_tensors %variant_op
   %variant_op_3 = transform.iree.bufferize %variant_op_2
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   transform.iree.erase_hal_descriptor_type_from_memref %memref_func
 }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir
index 5ad61a5..3aa246f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir
@@ -1,20 +1,20 @@
 transform.structured.canonicalized_sequence failures(propagate) {
 ^bb1(%variant_op: !pdl.operation):
-  %0 = transform.structured.match ops{["linalg.fill"]} in %variant_op
+  %0 = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %foreach_thread, %tiled_fill = transform.structured.tile_to_foreach_thread_op %0 num_threads [5, 1] 
   ( mapping = [#gpu.thread<y>, #gpu.thread<x>] )
 
-  %1 = transform.structured.match ops{["linalg.matmul"]} in %variant_op
+  %1 = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %foreach_thread_2, %tiled_matmul = transform.structured.tile_to_foreach_thread_op %1 num_threads [7, 9]
   ( mapping = [#gpu.thread<x>, #gpu.thread<y>] )
 
   %variant_op_2 = transform.iree.eliminate_empty_tensors %variant_op
   %variant_op_3 = transform.iree.bufferize %variant_op_2
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   transform.iree.erase_hal_descriptor_type_from_memref %memref_func
 
   // Get the function to apply to.
-  %2 = transform.structured.match ops{["linalg.matmul"]} in %variant_op_3
+  %2 = transform.structured.match ops{["linalg.matmul"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   %func = transform.get_closest_isolated_parent %2 : (!pdl.operation) -> !pdl.operation
   transform.iree.map_nested_foreach_thread_to_gpu_threads %func { workgroup_size = [10, 11]}
 }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_distribution_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_distribution_spec.mlir
index d44e05c..d52461a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_distribution_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_distribution_spec.mlir
@@ -1,6 +1,6 @@
 transform.structured.canonicalized_sequence failures(propagate) {
 ^bb1(%arg1: !pdl.operation):
-  %if_op = transform.structured.match ops{["scf.if"]} in %arg1
+  %if_op = transform.structured.match ops{["scf.if"]} in %arg1 : (!pdl.operation) -> !pdl.operation
   %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
   %isolated = transform.get_closest_isolated_parent %warp : (!pdl.operation) -> !pdl.operation
   transform.iree.vector.warp_distribute %isolated
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir
index e24e76b..26cb9bf 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir
@@ -1,5 +1,5 @@
 transform.structured.canonicalized_sequence failures(propagate) {
 ^bb1(%arg1: !pdl.operation):
-  %if_op = transform.structured.match ops{["scf.if"]} in %arg1
+  %if_op = transform.structured.match ops{["scf.if"]} in %arg1 : (!pdl.operation) -> !pdl.operation
   transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
 }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_vector_to_mma.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_vector_to_mma.mlir
index e8f80fd..f182946 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_vector_to_mma.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_vector_to_mma.mlir
@@ -49,7 +49,7 @@
 }
 transform.structured.canonicalized_sequence failures(propagate) {
 ^bb1(%variant_op: !pdl.operation):
-  %func = transform.structured.match ops{["func.func"]} in %variant_op
+  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   transform.iree.vector.vector_to_mma_conversion %func
 }
 }
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVVectorize.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVVectorize.cpp
index 5cb2191..7b779bd 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVVectorize.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVVectorize.cpp
@@ -232,8 +232,7 @@
       RewritePatternSet patterns(context);
       vector::populateVectorMultiReductionLoweringPatterns(
           patterns, vector::VectorMultiReductionLowering::InnerParallel);
-      if (failed(applyOpPatternsAndFold(reductionOps, std::move(patterns),
-                                        GreedyRewriteStrictness::AnyOp))) {
+      if (failed(applyOpPatternsAndFold(reductionOps, std::move(patterns)))) {
         funcOp.emitOpError("vector lowering failed");
         return signalPassFailure();
       }
diff --git a/compiler/src/iree/compiler/Codegen/TransformDialectStrategies/GPU/Common.cpp b/compiler/src/iree/compiler/Codegen/TransformDialectStrategies/GPU/Common.cpp
index 93b4e3d..fdaef14 100644
--- a/compiler/src/iree/compiler/Codegen/TransformDialectStrategies/GPU/Common.cpp
+++ b/compiler/src/iree/compiler/Codegen/TransformDialectStrategies/GPU/Common.cpp
@@ -158,7 +158,8 @@
   // Locally suppress failures for this op only because it doesn't cover the
   // `threadIdx.x == 0 && threadIdx.y == 0` case at the moment.
   auto sequence = b.create<SequenceOp>(
-      TypeRange(), transform::FailurePropagationMode::Suppress, variantH);
+      TypeRange(), transform::FailurePropagationMode::Suppress, variantH,
+      /*extraBindings=*/ValueRange());
   {
     OpBuilder::InsertionGuard guard(b);
     b.createBlock(&sequence.getBody(), sequence.getBody().begin(),
diff --git a/compiler/src/iree/compiler/Codegen/Transforms/AffineMinDistributedSCFCanonicalization.cpp b/compiler/src/iree/compiler/Codegen/Transforms/AffineMinDistributedSCFCanonicalization.cpp
index 34c344d..1d6cb26 100644
--- a/compiler/src/iree/compiler/Codegen/Transforms/AffineMinDistributedSCFCanonicalization.cpp
+++ b/compiler/src/iree/compiler/Codegen/Transforms/AffineMinDistributedSCFCanonicalization.cpp
@@ -191,9 +191,11 @@
 
     // Explicitly walk and apply the pattern locally to avoid more general
     // folding on the rest of the IR.
-    funcOp.walk([&frozenPatterns](AffineMinOp minOp) {
-      (void)applyOpPatternsAndFold(minOp, frozenPatterns);
+    SmallVector<Operation *> minOps;
+    funcOp.walk([&minOps](AffineMinOp minOp) {
+      minOps.push_back(minOp.getOperation());
     });
+    (void)applyOpPatternsAndFold(minOps, frozenPatterns);
   }
 };
 }  // namespace
diff --git a/compiler/src/iree/compiler/Codegen/Utils/LinkingUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/LinkingUtils.cpp
index 75ae2f3..accf77f 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/LinkingUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Utils/LinkingUtils.cpp
@@ -83,7 +83,7 @@
           // Private symbols can be safely folded into duplicates or renamed.
           if (OperationEquivalence::isEquivalentTo(
                   targetOp, op, OperationEquivalence::exactValueMatch,
-                  OperationEquivalence::exactValueMatch,
+                  /*markEquivalent=*/nullptr,
                   OperationEquivalence::Flags::IgnoreLocations)) {
             // Optimization: skip over duplicate private symbols.
             // We could let CSE do this later, but we may as well check here.
@@ -156,7 +156,7 @@
       auto oldAttr = use.getSymbolRef();
       auto newAttr = map.lookup(oldAttr);
       if (!newAttr) continue;
-      auto newDict = use.getUser()->getAttrDictionary().replaceSubElements(
+      auto newDict = use.getUser()->getAttrDictionary().replace(
           [&](Attribute attr) -> std::pair<Attribute, WalkResult> {
             if (attr == oldAttr) {
               // Found old->new replacement.
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/FusionOfTensorOps.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/FusionOfTensorOps.cpp
index fa08d3a..e572627 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/FusionOfTensorOps.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/FusionOfTensorOps.cpp
@@ -123,15 +123,15 @@
     consumerOp->removeAttr(getConsumerAttributeName());
     producerOp->removeAttr(getProducerAttributeName());
 
-    FailureOr<Operation *> fusedOperation =
+    FailureOr<linalg::ElementwiseOpFusionResult> fusedOperation =
         linalg::fuseElementwiseOps(rewriter, fusedOperand);
     if (failed(fusedOperation)) {
       return rewriter.notifyMatchFailure(consumerOp,
                                          "failed to fuse with producer");
     }
-    assert(fusedOperation.value()->getNumResults() ==
+    assert(fusedOperation.value().fusedOp->getNumResults() ==
            producerOp->getNumResults() + consumerOp->getNumResults());
-    auto fusedResults = fusedOperation.value()->getResults();
+    auto fusedResults = fusedOperation.value().fusedOp->getResults();
     rewriter.replaceOp(producerOp,
                        fusedResults.take_front(producerOp->getNumResults()));
     rewriter.replaceOp(consumerOp,
@@ -190,8 +190,7 @@
   RewritePatternSet fusionPatterns(context);
   fusionPatterns.insert<FuseElementwiseOpsWithMultipleUses>(context);
   linalg::GenericOp::getCanonicalizationPatterns(fusionPatterns, context);
-  if (failed(applyPatternsAndFoldGreedily(funcOp->getRegions(),
-                                          std::move(fusionPatterns)))) {
+  if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(fusionPatterns)))) {
     return funcOp->emitOpError("multi use producer -> consumer fusion failed");
   }
   return numCandidates;
@@ -285,8 +284,7 @@
 
       GreedyRewriteConfig rewriteConfig;
       rewriteConfig.maxIterations = GreedyRewriteConfig::kNoLimit;
-      if (failed(applyPatternsAndFoldGreedily(funcOp->getRegions(),
-                                              std::move(fusionPatterns),
+      if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(fusionPatterns),
                                               rewriteConfig))) {
         funcOp->emitError("failed to apply fusion patterns");
         return signalPassFailure();
@@ -327,7 +325,7 @@
       memref::populateResolveRankedShapeTypeResultDimsPatterns(
           collapsingReshapePatterns);
       if (failed(applyPatternsAndFoldGreedily(
-              funcOp->getRegions(), std::move(collapsingReshapePatterns)))) {
+              funcOp, std::move(collapsingReshapePatterns)))) {
         funcOp->emitError("failed to apply collapsing reshape patterns");
         return signalPassFailure();
       }
@@ -343,7 +341,7 @@
     {
       RewritePatternSet opFoldingPatterns(&getContext());
       tensor::populateFoldTensorEmptyPatterns(opFoldingPatterns);
-      if (failed(applyPatternsAndFoldGreedily(funcOp->getRegions(),
+      if (failed(applyPatternsAndFoldGreedily(funcOp,
                                               std::move(opFoldingPatterns)))) {
         funcOp->emitError("failed to apply op folding patterns");
         return signalPassFailure();
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dialect_dispatch_spec.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dialect_dispatch_spec.mlir
index 87315ba..96f7f09 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dialect_dispatch_spec.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dialect_dispatch_spec.mlir
@@ -1,6 +1,6 @@
 transform.structured.canonicalized_sequence failures(propagate) {
 ^bb1(%arg1: !pdl.operation):
-  %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1
+  %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation
   %foreach_op, %tiled_op = transform.structured.tile_to_foreach_thread_op %0 num_threads [42, 67]
   %dispatch_op = transform.iree.foreach_thread_to_flow %foreach_op
 }
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dispatch_region_formation.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dispatch_region_formation.mlir
index b5db680..b952164 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dispatch_region_formation.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dispatch_region_formation.mlir
@@ -17,7 +17,7 @@
 ^bb0(%arg0: !pdl.operation):
   transform.sequence %arg0 : !pdl.operation failures(propagate) {
   ^bb1(%arg1: !pdl.operation):
-    %0 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1
+    %0 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1 : (!pdl.operation) -> !pdl.operation
     transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false }
   }
 }
@@ -48,9 +48,9 @@
 ^bb0(%arg0: !pdl.operation):
   transform.sequence %arg0 : !pdl.operation failures(propagate) {
   ^bb1(%arg1: !pdl.operation):
-    %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1
+    %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!pdl.operation) -> !pdl.operation
     %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false }
-    %1 = transform.structured.match ops{["test.dummy"]} in %arg1
+    %1 = transform.structured.match ops{["test.dummy"]} in %arg1 : (!pdl.operation) -> !pdl.operation
     transform.iree.clone_preceding_op_into_dispatch_region %1 into %dispatch_op
   }
 }
@@ -81,9 +81,9 @@
 ^bb0(%arg0: !pdl.operation):
   transform.sequence %arg0 : !pdl.operation failures(propagate) {
   ^bb1(%arg1: !pdl.operation):
-    %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1
+    %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!pdl.operation) -> !pdl.operation
     %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false }
-    %1 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1
+    %1 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1 : (!pdl.operation) -> !pdl.operation
     transform.iree.move_preceding_op_into_dispatch_region %1 into %dispatch_op
   }
 }
@@ -108,7 +108,7 @@
 ^bb0(%arg0: !pdl.operation):
   transform.sequence %arg0 : !pdl.operation failures(propagate) {
   ^bb1(%arg1: !pdl.operation):
-    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1
+    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation
     %region_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false }
     transform.iree.region_to_workgroups %region_op
   }
@@ -143,9 +143,9 @@
 ^bb0(%arg0: !pdl.operation):
   transform.sequence %arg0 : !pdl.operation failures(propagate) {
   ^bb1(%arg1: !pdl.operation):
-    %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1
+    %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!pdl.operation) -> !pdl.operation
     %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false }
-    %1 = transform.structured.match attributes{"__tagged__"} in %arg1
+    %1 = transform.structured.match attributes{"__tagged__"} in %arg1 : (!pdl.operation) -> !pdl.operation
     transform.iree.clone_preceding_op_into_dispatch_region %1 into %dispatch_op
   }
 }
@@ -176,9 +176,9 @@
 ^bb0(%arg0: !pdl.operation):
   transform.sequence %arg0 : !pdl.operation failures(propagate) {
   ^bb1(%arg1: !pdl.operation):
-    %0 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1
+    %0 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1 : (!pdl.operation) -> !pdl.operation
     %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false }
-    %1 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1
+    %1 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!pdl.operation) -> !pdl.operation
     transform.iree.move_succeeding_op_into_dispatch_region %1 into %dispatch_op
   }
 }
@@ -212,9 +212,9 @@
 ^bb0(%arg0: !pdl.operation):
   transform.sequence %arg0 : !pdl.operation failures(propagate) {
   ^bb1(%arg1: !pdl.operation):
-    %0 = transform.structured.match ops{["test.dummy_op"]} in %arg1
+    %0 = transform.structured.match ops{["test.dummy_op"]} in %arg1 : (!pdl.operation) -> !pdl.operation
     %dispatch_op = transform.iree.wrap_in_dispatch_region %0  {generateWorkload=false}
-    %1 = transform.structured.match attributes{"__tagged__"} in %arg1
+    %1 = transform.structured.match attributes{"__tagged__"} in %arg1 : (!pdl.operation) -> !pdl.operation
     transform.iree.move_succeeding_op_into_dispatch_region %1 into %dispatch_op
   }
 }
@@ -242,9 +242,9 @@
 ^bb0(%arg0: !pdl.operation):
   transform.sequence %arg0 : !pdl.operation failures(propagate) {
   ^bb1(%arg1: !pdl.operation):
-    %0 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1
+    %0 = transform.structured.match ops{["tensor.extract_slice"]} in %arg1 : (!pdl.operation) -> !pdl.operation
     %dispatch_op = transform.iree.wrap_in_dispatch_region %0  {generateWorkload=false}
-    %1 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1
+    %1 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!pdl.operation) -> !pdl.operation
     transform.iree.clone_succeeding_op_into_dispatch_region %1 into %dispatch_op
   }
 }
@@ -278,7 +278,7 @@
 ^bb0(%arg0: !pdl.operation):
   transform.sequence %arg0 : !pdl.operation failures(propagate) {
   ^bb1(%arg1: !pdl.operation):
-    %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1
+    %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!pdl.operation) -> !pdl.operation
     %dispatch_op = transform.iree.wrap_in_dispatch_region %0 { generateWorkload = false }
   }
 }
diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALBase.td b/compiler/src/iree/compiler/Dialect/HAL/IR/HALBase.td
index fb51853..b0c85f2 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALBase.td
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALBase.td
@@ -13,7 +13,6 @@
 include "iree/compiler/Dialect/Util/IR/UtilTypes.td"
 include "mlir/IR/AttrTypeBase.td"
 include "mlir/IR/EnumAttr.td"
-include "mlir/IR/SubElementInterfaces.td"
 
 //===----------------------------------------------------------------------===//
 // HAL enums
@@ -625,9 +624,7 @@
 //===----------------------------------------------------------------------===//
 
 def HAL_DeviceTargetAttr :
-    AttrDef<HAL_Dialect, "DeviceTarget", [
-      SubElementAttrInterface,
-    ]> {
+    AttrDef<HAL_Dialect, "DeviceTarget"> {
   let mnemonic = "device.target";
   let summary = [{generic device target specification}];
   let description = [{
@@ -685,9 +682,7 @@
 }
 
 def HAL_ExecutableTargetAttr :
-    AttrDef<HAL_Dialect, "ExecutableTarget", [
-      DeclareAttrInterfaceMethods<SubElementAttrInterface>,
-    ]> {
+    AttrDef<HAL_Dialect, "ExecutableTarget"> {
   let mnemonic = "executable.target";
   let summary = [{generic executable target specification}];
   let description = [{
@@ -902,7 +897,6 @@
 
 def HAL_MatchAnyAttr :
     AttrDef<HAL_Dialect, "MatchAny", [
-      DeclareAttrInterfaceMethods<SubElementAttrInterface>,
       DeclareAttrInterfaceMethods<HAL_MatchAttrInterface>,
     ]> {
   let mnemonic = "match.any";
@@ -923,7 +917,6 @@
 
 def HAL_MatchAllAttr :
     AttrDef<HAL_Dialect, "MatchAll", [
-      DeclareAttrInterfaceMethods<SubElementAttrInterface>,
       DeclareAttrInterfaceMethods<HAL_MatchAttrInterface>,
     ]> {
   let mnemonic = "match.all";
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVM/LibraryBuilder.cpp b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVM/LibraryBuilder.cpp
index 2cc5806..7225a9e 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVM/LibraryBuilder.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVM/LibraryBuilder.cpp
@@ -61,7 +61,10 @@
 static llvm::StructType *makeEnvironmentType(llvm::LLVMContext &context) {
   auto *type = llvm::StructType::getTypeByName(
       context, "iree_hal_executable_environment_v0_t");
-  assert(type && "environment type must be defined by ConvertToLLVM");
+  if (!type) {
+    type = llvm::StructType::create(context,
+                                    "iree_hal_executable_environment_v0_t");
+  }
   return type;
 }
 
@@ -71,7 +74,10 @@
 static llvm::StructType *makeDispatchStateType(llvm::LLVMContext &context) {
   auto *type = llvm::StructType::getTypeByName(
       context, "iree_hal_executable_dispatch_state_v0_t");
-  assert(type && "state type must be defined by ConvertToLLVM");
+  if (!type) {
+    type = llvm::StructType::create(context,
+                                    "iree_hal_executable_dispatch_state_v0_t");
+  }
   return type;
 }
 
@@ -81,7 +87,10 @@
 static llvm::StructType *makeWorkgroupStateType(llvm::LLVMContext &context) {
   auto *type = llvm::StructType::getTypeByName(
       context, "iree_hal_executable_workgroup_state_v0_t");
-  assert(type && "state type must be defined by ConvertToLLVM");
+  if (!type) {
+    type = llvm::StructType::create(context,
+                                    "iree_hal_executable_workgroup_state_v0_t");
+  }
   return type;
 }
 
diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/BUILD b/compiler/src/iree/compiler/Dialect/Stream/IR/BUILD
index 3e0fa02..ef91419 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/BUILD
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/BUILD
@@ -32,7 +32,6 @@
         "@llvm-project//mlir:InferTypeOpInterfaceTdFiles",
         "@llvm-project//mlir:OpBaseTdFiles",
         "@llvm-project//mlir:SideEffectInterfacesTdFiles",
-        "@llvm-project//mlir:SubElementInterfacesTdFiles",
         "@llvm-project//mlir:ViewLikeInterfaceTdFiles",
     ],
 )
diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamBase.td b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamBase.td
index e99fc9e..b39d6f3 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamBase.td
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamBase.td
@@ -14,7 +14,6 @@
 include "mlir/IR/AttrTypeBase.td"
 include "mlir/IR/BuiltinAttributeInterfaces.td"
 include "mlir/IR/EnumAttr.td"
-include "mlir/IR/SubElementInterfaces.td"
 
 //===----------------------------------------------------------------------===//
 // IREE stream modeling dialect
@@ -250,9 +249,7 @@
 }
 
 def Stream_PartitioningConfigAttr :
-    AttrDef<Stream_Dialect, "PartitioningConfig", [
-      SubElementAttrInterface,
-    ]> {
+    AttrDef<Stream_Dialect, "PartitioningConfig"> {
   let mnemonic = "partitioning_config";
   let summary = [{defines partitioning configuration}];
   let description = [{
diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/BUILD b/compiler/src/iree/compiler/Dialect/Util/IR/BUILD
index a1c904d..8f89710 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/BUILD
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/BUILD
@@ -37,7 +37,6 @@
         "@llvm-project//mlir:InferTypeOpInterfaceTdFiles",
         "@llvm-project//mlir:OpBaseTdFiles",
         "@llvm-project//mlir:SideEffectInterfacesTdFiles",
-        "@llvm-project//mlir:SubElementInterfacesTdFiles",
         "@llvm-project//mlir:ViewLikeInterfaceTdFiles",
     ],
 )
diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/UtilAttrs.td b/compiler/src/iree/compiler/Dialect/Util/IR/UtilAttrs.td
index d462c0a..e5737ad 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/UtilAttrs.td
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/UtilAttrs.td
@@ -11,7 +11,6 @@
 include "iree/compiler/Dialect/Util/IR/UtilInterfaces.td"
 include "mlir/IR/AttrTypeBase.td"
 include "mlir/IR/OpBase.td"
-include "mlir/IR/SubElementInterfaces.td"
 
 //===----------------------------------------------------------------------===//
 // Buffer attributes
@@ -33,7 +32,6 @@
 }
 
 def Util_CompositeAttr : AttrDef<Util_Dialect, "Composite", [
-  SubElementAttrInterface,
   DeclareAttrInterfaceMethods<Util_SerializableAttrInterface, [
     "serializeToBuffer",
     "serializeToStream",
diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.h b/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.h
index f46be35..1915a59 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.h
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.h
@@ -16,7 +16,6 @@
 #include "mlir/IR/Location.h"
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/IR/PatternMatch.h"
-#include "mlir/IR/SubElementInterfaces.h"
 #include "mlir/IR/TypeSupport.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/IR/Types.h"
diff --git a/compiler/src/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing.mlir b/compiler/src/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing.mlir
index ef1208a..5a0ec47 100644
--- a/compiler/src/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing.mlir
+++ b/compiler/src/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing.mlir
@@ -32,7 +32,7 @@
 // -----
 
 // CHECK: @reorder_broadcast_in_dim_scalar_binary(%[[ARG0:.*]]: tensor<f32>, %[[ARG1:.*]]: tensor<f32>, %[[ARG2:.*]]: tensor<i32>, %[[ARG3:.*]]: tensor<i32>)
-func.func @reorder_broadcast_in_dim_scalar_binary(%arg0: tensor<f32>, %arg1: tensor<f32>, %arg2: tensor<i32>, %arg3: tensor<i32>) -> (tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xi32>, tensor<1x8x8x64xi32>, tensor<1x8x8x64xi32>) {
+func.func @reorder_broadcast_in_dim_scalar_binary(%arg0: tensor<f32>, %arg1: tensor<f32>, %arg2: tensor<i32>, %arg3: tensor<i32>) -> (tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xi32>, tensor<1x8x8x64xi32>, tensor<1x8x8x64xi32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xi32>, tensor<1x8x8x64xi32>, tensor<1x8x8x64xi32>) {
   // CHECK: %[[ADD:.*]] = mhlo.add %[[ARG0]], %[[ARG1]] : tensor<f32>
   // CHECK: "mhlo.broadcast_in_dim"(%[[ADD]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>) -> tensor<1x8x8x64xf32>
   // CHECK: %[[ATAN2:.*]] = mhlo.atan2 %[[ARG0]], %[[ARG1]] : tensor<f32>
@@ -49,12 +49,12 @@
   // CHECK: "mhlo.broadcast_in_dim"(%[[POW]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>) -> tensor<1x8x8x64xf32>
   // CHECK: %[[REM:.*]] = mhlo.remainder %[[ARG0]], %[[ARG1]] : tensor<f32>
   // CHECK: "mhlo.broadcast_in_dim"(%[[REM]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>) -> tensor<1x8x8x64xf32>
-  // CHECK: %[[SL:.*]] = mhlo.shift_left %[[ARG0]], %[[ARG1]] : tensor<f32>
-  // CHECK: "mhlo.broadcast_in_dim"(%[[SL]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>) -> tensor<1x8x8x64xf32>
-  // CHECK: %[[SRA:.*]] = mhlo.shift_right_arithmetic %[[ARG0]], %[[ARG1]] : tensor<f32>
-  // CHECK: "mhlo.broadcast_in_dim"(%[[SRA]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>) -> tensor<1x8x8x64xf32>
-  // CHECK: %[[SRL:.*]] = mhlo.shift_right_logical %[[ARG0]], %[[ARG1]] : tensor<f32>
-  // CHECK: "mhlo.broadcast_in_dim"(%[[SRL]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>) -> tensor<1x8x8x64xf32>
+  // CHECK: %[[SL:.*]] = mhlo.shift_left %[[ARG2]], %[[ARG3]] : tensor<i32>
+  // CHECK: "mhlo.broadcast_in_dim"(%[[SL]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<i32>) -> tensor<1x8x8x64xi32>
+  // CHECK: %[[SRA:.*]] = mhlo.shift_right_arithmetic %[[ARG2]], %[[ARG3]] : tensor<i32>
+  // CHECK: "mhlo.broadcast_in_dim"(%[[SRA]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<i32>) -> tensor<1x8x8x64xi32>
+  // CHECK: %[[SRL:.*]] = mhlo.shift_right_logical %[[ARG2]], %[[ARG3]] : tensor<i32>
+  // CHECK: "mhlo.broadcast_in_dim"(%[[SRL]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<i32>) -> tensor<1x8x8x64xi32>
   // CHECK: %[[SUB:.*]] = mhlo.subtract %[[ARG0]], %[[ARG1]] : tensor<f32>
   // CHECK: "mhlo.broadcast_in_dim"(%[[SUB]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>) -> tensor<1x8x8x64xf32>
   // CHECK: %[[AND:.*]] = mhlo.and %[[ARG2]], %[[ARG3]] : tensor<i32>
@@ -75,14 +75,14 @@
   %9 = mhlo.multiply %0, %1 : tensor<1x8x8x64xf32>
   %10 = mhlo.power %0, %1 : tensor<1x8x8x64xf32>
   %11 = mhlo.remainder %0, %1 : tensor<1x8x8x64xf32>
-  %12 = mhlo.shift_left %0, %1 : tensor<1x8x8x64xf32>
-  %13 = mhlo.shift_right_arithmetic %0, %1 : tensor<1x8x8x64xf32>
-  %14 = mhlo.shift_right_logical %0, %1 : tensor<1x8x8x64xf32>
+  %12 = mhlo.shift_left %2, %3 : tensor<1x8x8x64xi32>
+  %13 = mhlo.shift_right_arithmetic %2, %3 : tensor<1x8x8x64xi32>
+  %14 = mhlo.shift_right_logical %2, %3 : tensor<1x8x8x64xi32>
   %15 = mhlo.subtract %0, %1 : tensor<1x8x8x64xf32>
   %16 = mhlo.and %2, %3 : tensor<1x8x8x64xi32>
   %17 = mhlo.or %2, %3 : tensor<1x8x8x64xi32>
   %18 = mhlo.xor %2, %3 : tensor<1x8x8x64xi32>
-  return %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, %16, %17, %18 : tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xi32>, tensor<1x8x8x64xi32>, tensor<1x8x8x64xi32>
+  return %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, %16, %17, %18 : tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xi32>, tensor<1x8x8x64xi32>, tensor<1x8x8x64xi32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xi32>, tensor<1x8x8x64xi32>, tensor<1x8x8x64xi32>
 }
 
 // -----
diff --git a/compiler/src/iree/compiler/Utils/ModuleUtils.cpp b/compiler/src/iree/compiler/Utils/ModuleUtils.cpp
index 119cf53..9b78231 100644
--- a/compiler/src/iree/compiler/Utils/ModuleUtils.cpp
+++ b/compiler/src/iree/compiler/Utils/ModuleUtils.cpp
@@ -92,7 +92,7 @@
           // Private symbols can be safely folded into duplicates or renamed.
           if (OperationEquivalence::isEquivalentTo(
                   targetOp, sourceOp, OperationEquivalence::exactValueMatch,
-                  OperationEquivalence::exactValueMatch,
+                  /*markEquivalent=*/nullptr,
                   OperationEquivalence::Flags::IgnoreLocations)) {
             // Optimization: skip over duplicate private symbols.
             // We could let CSE do this later, but we may as well check here.
diff --git a/integrations/tensorflow/WORKSPACE b/integrations/tensorflow/WORKSPACE
index ed58de3..717c320 100644
--- a/integrations/tensorflow/WORKSPACE
+++ b/integrations/tensorflow/WORKSPACE
@@ -7,7 +7,7 @@
 
 load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
 
-TENSORFLOW_COMMIT = "cf2c76b511f21da0da9f0fae3b8ef768bf11fe99"
+TENSORFLOW_COMMIT = "c7764610dac1883e3649b572698e11a298dffdb6"
 
 git_repository(
     name = "org_tensorflow",
diff --git a/integrations/tensorflow/iree_tf_compiler/iree-import-tf-main.cpp b/integrations/tensorflow/iree_tf_compiler/iree-import-tf-main.cpp
index 235a1e4..30ac749 100644
--- a/integrations/tensorflow/iree_tf_compiler/iree-import-tf-main.cpp
+++ b/integrations/tensorflow/iree_tf_compiler/iree-import-tf-main.cpp
@@ -248,7 +248,8 @@
 
   // Run passes.
   {
-    PassManager pm(&context, PassManager::Nesting::Implicit);
+    PassManager pm(&context, module.get()->getName().getStringRef(),
+                   PassManager::Nesting::Implicit);
     applyPassManagerCLOptions(pm);
 
     if (prettifyTfDebugInfo) {
diff --git a/integrations/tensorflow/iree_tf_compiler/iree-import-tflite-main.cpp b/integrations/tensorflow/iree_tf_compiler/iree-import-tflite-main.cpp
index a17238a..2d0b6a3 100644
--- a/integrations/tensorflow/iree_tf_compiler/iree-import-tflite-main.cpp
+++ b/integrations/tensorflow/iree_tf_compiler/iree-import-tflite-main.cpp
@@ -162,7 +162,8 @@
   }
 
   // Run transformations.
-  PassManager pm(&context, PassManager::Nesting::Implicit);
+  PassManager pm(&context, module.get()->getName().getStringRef(),
+                 PassManager::Nesting::Implicit);
   applyPassManagerCLOptions(pm);
   applyDefaultTimingPassManagerCLOptions(pm);
   mlir::iree_integrations::TFL::buildTFLImportPassPipeline(pm);
diff --git a/integrations/tensorflow/iree_tf_compiler/iree-import-xla-main.cpp b/integrations/tensorflow/iree_tf_compiler/iree-import-xla-main.cpp
index 5f57245..d020532 100644
--- a/integrations/tensorflow/iree_tf_compiler/iree-import-xla-main.cpp
+++ b/integrations/tensorflow/iree_tf_compiler/iree-import-xla-main.cpp
@@ -296,7 +296,8 @@
   }
 
   // Run passes.
-  PassManager pm(&context, PassManager::Nesting::Implicit);
+  PassManager pm(&context, module.get()->getName().getStringRef(),
+                 PassManager::Nesting::Implicit);
   applyPassManagerCLOptions(pm);
   applyDefaultTimingPassManagerCLOptions(pm);
 
diff --git a/llvm-external-projects/iree-dialects/lib/Dialect/LinalgTransform/IR/StructuredTransformOpsExt.cpp b/llvm-external-projects/iree-dialects/lib/Dialect/LinalgTransform/IR/StructuredTransformOpsExt.cpp
index a49fcca..113e61c 100644
--- a/llvm-external-projects/iree-dialects/lib/Dialect/LinalgTransform/IR/StructuredTransformOpsExt.cpp
+++ b/llvm-external-projects/iree-dialects/lib/Dialect/LinalgTransform/IR/StructuredTransformOpsExt.cpp
@@ -983,7 +983,7 @@
         .enableX86Vector(getEnableX86vector())));
   // clang-format on
   pm.addNestedPass<func::FuncOp>(createConvertMathToLLVMPass());
-  pm.addPass(createMemRefToLLVMConversionPass());
+  pm.addPass(createFinalizeMemRefToLLVMConversionPass());
   if (getEnableAsync())
     pm.addPass(createConvertAsyncToLLVMPass());
   pm.addPass(createConvertFuncToLLVMPass());
diff --git a/llvm-external-projects/iree-dialects/lib/Dialect/LinalgTransform/Passes/TransformInterpreterPassBase.cpp b/llvm-external-projects/iree-dialects/lib/Dialect/LinalgTransform/Passes/TransformInterpreterPassBase.cpp
index fca84ef..87d5457 100644
--- a/llvm-external-projects/iree-dialects/lib/Dialect/LinalgTransform/Passes/TransformInterpreterPassBase.cpp
+++ b/llvm-external-projects/iree-dialects/lib/Dialect/LinalgTransform/Passes/TransformInterpreterPassBase.cpp
@@ -110,7 +110,7 @@
 #endif
     auto xform = cast<transform::TransformOpInterface>(b.clone(*transform));
     auto g = llvm::make_scope_exit([&]() { xform->erase(); });
-    if (failed(transform::applyTransforms(target, xform, options)))
+    if (failed(transform::applyTransforms(target, xform, {}, options)))
       return failure();
   }
   return success();
diff --git a/llvm-external-projects/iree-dialects/lib/Transforms/ListenerCSE.cpp b/llvm-external-projects/iree-dialects/lib/Transforms/ListenerCSE.cpp
index e9ba6d7..866b6d8 100644
--- a/llvm-external-projects/iree-dialects/lib/Transforms/ListenerCSE.cpp
+++ b/llvm-external-projects/iree-dialects/lib/Transforms/ListenerCSE.cpp
@@ -44,8 +44,7 @@
       return OperationEquivalence::isEquivalentTo(
           const_cast<Operation *>(lhsC), const_cast<Operation *>(rhsC),
           OperationEquivalence::exactValueMatch,
-          OperationEquivalence::ignoreValueEquivalence,
-          OperationEquivalence::IgnoreLocations);
+          /*markEquivalent=*/nullptr, OperationEquivalence::IgnoreLocations);
     }
 
     // If lhs or rhs does not have a single region with a single block, they
@@ -80,7 +79,8 @@
 
     // Callback to compare if operands of ops in the region of `lhs` and `rhs`
     // are equivalent.
-    auto mapOperands = [&](Value lhsValue, Value rhsValue) -> LogicalResult {
+    auto checkEquivalent = [&](Value lhsValue,
+                               Value rhsValue) -> LogicalResult {
       if (lhsValue == rhsValue)
         return success();
       if (areEquivalentValues.lookup(lhsValue) == rhsValue)
@@ -90,17 +90,15 @@
 
     // Callback to compare if results of ops in the region of `lhs` and `rhs`
     // are equivalent.
-    auto mapResults = [&](Value lhsResult, Value rhsResult) -> LogicalResult {
+    auto markEquivalent = [&](Value lhsResult, Value rhsResult) {
       if (getParent(lhsResult) == lhs && getParent(rhsResult) == rhs) {
-        auto insertion = areEquivalentValues.insert({lhsResult, rhsResult});
-        return success(insertion.first->second == rhsResult);
+        areEquivalentValues.insert({lhsResult, rhsResult});
       }
-      return success();
     };
 
     return OperationEquivalence::isEquivalentTo(
         const_cast<Operation *>(lhsC), const_cast<Operation *>(rhsC),
-        mapOperands, mapResults, OperationEquivalence::IgnoreLocations);
+        checkEquivalent, markEquivalent, OperationEquivalence::IgnoreLocations);
   }
 };
 } // namespace
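
The OperationEquivalence::isEquivalentTo callbacks also changed upstream: the first
(checkEquivalent) still returns a LogicalResult when comparing operands, while the
second (markEquivalent) is now a plain notification that two results were found
equivalent and may be left null. A minimal standalone sketch of the updated call,
assuming the declaration in mlir/IR/OperationSupport.h and a hypothetical helper
name:

  #include "mlir/IR/OperationSupport.h"

  // Sketch only: structural comparison of two ops that treats values as
  // equivalent only when identical, ignores locations, and keeps no
  // equivalence map (hence the null markEquivalent callback).
  static bool areStructurallyEquivalent(mlir::Operation *lhs,
                                        mlir::Operation *rhs) {
    return mlir::OperationEquivalence::isEquivalentTo(
        lhs, rhs,
        /*checkEquivalent=*/mlir::OperationEquivalence::exactValueMatch,
        /*markEquivalent=*/nullptr,
        mlir::OperationEquivalence::IgnoreLocations);
  }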
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/foreach-thread-to-async.mlir b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/foreach-thread-to-async.mlir
index feb6265..9e2c353 100644
--- a/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/foreach-thread-to-async.mlir
+++ b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/foreach-thread-to-async.mlir
@@ -52,6 +52,6 @@
 
 transform.structured.canonicalized_sequence failures(propagate) {
 ^bb1(%module_op: !pdl.operation):
-  %0 = transform.structured.match ops{["scf.foreach_thread"]} in %module_op
+  %0 = transform.structured.match ops{["scf.foreach_thread"]} in %module_op : (!pdl.operation) -> !pdl.operation
   %1 = foreach_thread_to_async %0
 }
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/foreach-thread-to-scf-for.mlir b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/foreach-thread-to-scf-for.mlir
index ba4f41c..a01da7e 100644
--- a/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/foreach-thread-to-scf-for.mlir
+++ b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/foreach-thread-to-scf-for.mlir
@@ -46,6 +46,6 @@
 
 transform.structured.canonicalized_sequence failures(propagate) {
 ^bb1(%module_op: !pdl.operation):
-  %0 = transform.structured.match ops{["scf.foreach_thread"]} in %module_op
+  %0 = transform.structured.match ops{["scf.foreach_thread"]} in %module_op : (!pdl.operation) -> !pdl.operation
   %1 = foreach_thread_to_scf_for %0
 }
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/roundtrip.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/roundtrip.mlir
index 1b5c2f2..e4f0d12 100644
--- a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/roundtrip.mlir
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/roundtrip.mlir
@@ -23,7 +23,7 @@
   bufferize
   // CHECK: %[[FUNC:.*]] = transform.structured.match ops{["func.func"]} in %arg0
   // CHECK: lower_vectors %[[FUNC]] {{.*}} multireduction_lowering = innerreduction
-  %6 = transform.structured.match ops{["func.func"]} in %arg0
+  %6 = transform.structured.match ops{["func.func"]} in %arg0 : (!pdl.operation) -> !pdl.operation
   transform.vector.lower_vectors %6 multireduction_lowering = "innerreduction"
   // CHECK: lower_to_llvm
   lower_to_llvm
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/single-tiling-full-script.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/single-tiling-full-script.mlir
index bececf5..62b962f 100644
--- a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/single-tiling-full-script.mlir
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/single-tiling-full-script.mlir
@@ -15,13 +15,13 @@
 
 transform.structured.canonicalized_sequence failures(propagate) {
 ^bb1(%module_op: !pdl.operation):
-  %0 = transform.structured.match ops{["linalg.matmul"]} in %module_op
+  %0 = transform.structured.match ops{["linalg.matmul"]} in %module_op : (!pdl.operation) -> !pdl.operation
   %1, %loops:3 = transform.structured.tile %0 [4, 4, 4]
       : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation, !pdl.operation)
   %2 = get_closest_isolated_parent %1 : (!pdl.operation) -> !pdl.operation
   transform.structured.vectorize %2 { vectorize_padding }
   bufferize
-  %3 = transform.structured.match ops{["func.func"]} in %module_op
+  %3 = transform.structured.match ops{["func.func"]} in %module_op : (!pdl.operation) -> !pdl.operation
   transform.vector.lower_vectors %3 multireduction_lowering = "innerreduction"
   lower_to_llvm
 }
diff --git a/tests/e2e/linalg_transform/transform_dialect_codegen_spec.mlir b/tests/e2e/linalg_transform/transform_dialect_codegen_spec.mlir
index d5f20ac..f891e49 100644
--- a/tests/e2e/linalg_transform/transform_dialect_codegen_spec.mlir
+++ b/tests/e2e/linalg_transform/transform_dialect_codegen_spec.mlir
@@ -1,6 +1,6 @@
 transform.structured.canonicalized_sequence failures(propagate) {
 ^bb1(%variant_op: !pdl.operation):
   %variant_op_2 = transform.iree.bufferize %variant_op
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_2
+  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_2 : (!pdl.operation) -> !pdl.operation
   transform.iree.erase_hal_descriptor_type_from_memref %memref_func
 }
diff --git a/tests/e2e/linalg_transform/transform_dialect_dispatch_spec.mlir b/tests/e2e/linalg_transform/transform_dialect_dispatch_spec.mlir
index f140aff..249a926 100644
--- a/tests/e2e/linalg_transform/transform_dialect_dispatch_spec.mlir
+++ b/tests/e2e/linalg_transform/transform_dialect_dispatch_spec.mlir
@@ -1,6 +1,6 @@
 transform.structured.canonicalized_sequence failures(propagate) {
 ^bb1(%arg1: !pdl.operation):
-  %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1
+  %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation
   %foreach_op, %tiled_op = transform.structured.tile_to_foreach_thread_op %0 num_threads [13, 33]
   %dispatch_op = transform.iree.foreach_thread_to_flow %foreach_op
 }
diff --git a/tests/transform_dialect/cpu/matmul_codegen_custom_dispatch_formation_spec.mlir b/tests/transform_dialect/cpu/matmul_codegen_custom_dispatch_formation_spec.mlir
index d49290f..3f25087 100644
--- a/tests/transform_dialect/cpu/matmul_codegen_custom_dispatch_formation_spec.mlir
+++ b/tests/transform_dialect/cpu/matmul_codegen_custom_dispatch_formation_spec.mlir
@@ -2,7 +2,7 @@
 
 transform.structured.canonicalized_sequence failures(propagate) {
 ^bb1(%variant_op: !pdl.operation):
-  %0 = transform.structured.match ops{["linalg.matmul"]} in %variant_op
+  %0 = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!pdl.operation) -> !pdl.operation
 
   %foreach_thread, %tiled_generic =
     transform.structured.tile_to_foreach_thread_op %0 num_threads [2] 
@@ -10,9 +10,9 @@
     ( mapping = [#gpu.block<x>] )
 
   %1 = transform.iree.bufferize %variant_op
-  %memref_func = transform.structured.match ops{["func.func"]} in %1
+  %memref_func = transform.structured.match ops{["func.func"]} in %1 : (!pdl.operation) -> !pdl.operation
   transform.iree.erase_hal_descriptor_type_from_memref %memref_func
 
-  %func = transform.structured.match ops{["func.func"]} in %1
+  %func = transform.structured.match ops{["func.func"]} in %1 : (!pdl.operation) -> !pdl.operation
   transform.iree.foreach_thread_to_workgroup %func
 }
diff --git a/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir b/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir
index 3efd0cb..8d33c24 100644
--- a/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir
+++ b/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir
@@ -2,7 +2,7 @@
 
 transform.structured.canonicalized_sequence failures(propagate) {
 ^bb1(%variant_op: !pdl.operation):
-  %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op
+  %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!pdl.operation) -> !pdl.operation
 
   // Step 1. Tile to foreach_thread with tile_sizes [2].
   // ===================================================
@@ -15,11 +15,11 @@
   // =========================================================
   %variant_op_2 = transform.iree.eliminate_empty_tensors %variant_op
   %variant_op_3 = transform.iree.bufferize %variant_op_2
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   transform.iree.erase_hal_descriptor_type_from_memref %memref_func
 
   // Step 3. Post-bufferization mapping workgroup.
   // =========================================================
-  %func = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   transform.iree.foreach_thread_to_workgroup %func
 }
diff --git a/tests/transform_dialect/cuda/eltwise_reduction_codegen_spec.mlir b/tests/transform_dialect/cuda/eltwise_reduction_codegen_spec.mlir
index edb8708..beacf03 100644
--- a/tests/transform_dialect/cuda/eltwise_reduction_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/eltwise_reduction_codegen_spec.mlir
@@ -2,11 +2,11 @@
 
 transform.structured.canonicalized_sequence failures(propagate) {
 ^bb1(%variant_op: !pdl.operation):
-  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op
+  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!pdl.operation) -> !pdl.operation
 
   // Step 1. Split the reduction to get meatier (size(red) / 2)-way parallelism.
   // ===========================================================================
-  %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op
+  %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %eltwise, %reduction = transform.split_handles %0 in [2] : (!pdl.operation) -> (!pdl.operation, !pdl.operation)
   %init_or_alloc_op, %more_parallel_fill_op, %more_parallel_op, %combiner_op =
     transform.structured.split_reduction %reduction
@@ -24,32 +24,32 @@
   // TODO: bubbling should be a proper transform op, at which point we will be
   // able to preserve the handles.
   // ===========================================================================
-  %func = transform.structured.match ops{["func.func"]} in %variant_op
+  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   transform.iree.apply_patterns %func { bubble_collapse_expand }
-  %fills = transform.structured.match ops{["linalg.fill"]} in %variant_op
+  %fills = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %fill_2, %more_parallel_fill_2 = transform.split_handles %fills in [2]
     : (!pdl.operation) -> (!pdl.operation, !pdl.operation)
-  %generics = transform.structured.match ops{["linalg.generic"]} in %variant_op
+  %generics = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %expanded_eltwise, %more_parallel_2, %combiner_2 =
     transform.split_handles %generics in [3] : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation)
-  %foreach_thread_grid_2 = transform.structured.match ops{["scf.foreach_thread"]} in %variant_op
+  %foreach_thread_grid_2 = transform.structured.match ops{["scf.foreach_thread"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %not_combiner = transform.merge_handles %fill_2, %more_parallel_fill_2, %more_parallel_2, %expanded_eltwise : !pdl.operation
   transform.structured.fuse_into_containing_op %not_combiner into %foreach_thread_grid_2
 
   // Step 3. Second level of tiling + fusion parallelizes to threads. Also
   // fuse in the leading elementwise.
   // ===========================================================================
-  %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op
+  %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op : (!pdl.operation) -> !pdl.operation
   %foreach_thread_block_combiner_op, %block_combiner_op =
     transform.structured.tile_to_foreach_thread_op %combiner_2 tile_sizes [1] 
     ( mapping = [#gpu.thread<z>] )
   transform.structured.fuse_into_containing_op %fill_1d into %foreach_thread_block_combiner_op
 
-  %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op
-  %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]}
-    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op
-  %grid_eltwise_op = transform.structured.match ops{["linalg.generic"]}
-    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]} in %variant_op
+  %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op : (!pdl.operation) -> !pdl.operation
+  %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]}
+    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op : (!pdl.operation) -> !pdl.operation
+  %grid_eltwise_op = transform.structured.match ops{["linalg.generic"]}
+    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %foreach_thread_block_more_parallel_op, %block_more_parallel_op =
     transform.structured.tile_to_foreach_thread_op %grid_more_parallel_op tile_sizes [1, 1] 
     ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
@@ -58,7 +58,7 @@
 
   // Step 4. Rank-reduce and vectorize.
   // ===========================================================================
-  %func_1 = transform.structured.match ops{["func.func"]} in %variant_op
+  %func_1 = transform.structured.match ops{["func.func"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %func_2 = transform.iree.apply_patterns %func_1 {  rank_reducing_linalg, rank_reducing_vector }
   %func_3 = transform.structured.vectorize %func_2
 
@@ -66,12 +66,12 @@
   // ===========================================================================
   %variant_op_2 = transform.iree.eliminate_empty_tensors %variant_op
   %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op_2
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   transform.iree.erase_hal_descriptor_type_from_memref %memref_func
 
   // Step 6. Post-bufferization mapping to blocks and threads.
   // ===========================================================================
-  %func_4 = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %func_4 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   %func_5 = transform.iree.foreach_thread_to_workgroup %func_4
   %func_6 = transform.iree.map_nested_foreach_thread_to_gpu_threads %func_5
       { workgroup_size = [32, 2, 1] }
@@ -79,7 +79,7 @@
   // Step 7. Post-bufferization vector distribution with rank-reduction.
   // ===========================================================================
   %func_7 = transform.iree.apply_patterns %func_6 {  rank_reducing_linalg, rank_reducing_vector }
-  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
+  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0)
   // at this point.
   transform.sequence %variant_op_3 : !pdl.operation failures(suppress) {
diff --git a/tests/transform_dialect/cuda/eltwise_reduction_eltwise_codegen_spec.mlir b/tests/transform_dialect/cuda/eltwise_reduction_eltwise_codegen_spec.mlir
index dabe691..43dedf3 100644
--- a/tests/transform_dialect/cuda/eltwise_reduction_eltwise_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/eltwise_reduction_eltwise_codegen_spec.mlir
@@ -2,11 +2,11 @@
 
 transform.structured.canonicalized_sequence failures(propagate) {
 ^bb1(%variant_op: !pdl.operation):
-  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op
+  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!pdl.operation) -> !pdl.operation
 
   // Step 1. Split the reduction to get meatier (size(red) / 2)-way parallelism.
   // ===========================================================================
-  %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op
+  %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %leading_eltwise, %reduction, %trailing_eltwise = transform.split_handles %0 in [3]
     : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation)
   %init_or_alloc_op, %more_parallel_fill_op, %more_parallel_op, %combiner_op =
@@ -26,12 +26,12 @@
   // TODO: bubbling should be a proper transform op, at which point we will be
   // able to preserve the handles.
   // ===========================================================================
-  %func = transform.structured.match ops{["func.func"]} in %variant_op
+  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   transform.iree.apply_patterns %func { bubble_collapse_expand }
-  %fills = transform.structured.match ops{["linalg.fill"]} in %variant_op
+  %fills = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %fill_2, %more_parallel_fill_2 = transform.split_handles %fills in [2]
     : (!pdl.operation) -> (!pdl.operation, !pdl.operation)
-  %generics = transform.structured.match ops{["linalg.generic"]} in %variant_op
+  %generics = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %expanded_eltwise, %more_parallel_2, %combiner_2, %trailing_eltwise_2 =
     transform.split_handles %generics in [4]
     : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation, !pdl.operation)
@@ -43,20 +43,20 @@
   // Step 3. Second level of tiling + fusion parallelizes to threads. Also
   // fuse in the leading and trailing elementwise.
   // ===========================================================================
-  %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op
+  %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op : (!pdl.operation) -> !pdl.operation
   %foreach_thread_trailing_eltwise_op, %block_trailing_eltwise_op =
     transform.structured.tile_to_foreach_thread_op %trailing_eltwise_2 tile_sizes [1] 
     ( mapping = [#gpu.thread<z>] )
   %block_combiner_op = transform.structured.match ops{["linalg.generic"]}
-    attributes {iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op
+    attributes {iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %fill_and_reduction = transform.merge_handles %fill_1d, %block_combiner_op : !pdl.operation
   transform.structured.fuse_into_containing_op %fill_and_reduction into %foreach_thread_trailing_eltwise_op
 
-  %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op
+  %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op : (!pdl.operation) -> !pdl.operation
   %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]}
-    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op
+    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %grid_eltwise_op = transform.structured.match ops{["linalg.generic"]}
-    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]} in %variant_op
+    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %foreach_thread_block_more_parallel_op, %block_more_parallel_op =
     transform.structured.tile_to_foreach_thread_op %grid_more_parallel_op tile_sizes [1, 1] 
     ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
@@ -65,7 +65,7 @@
 
   // Step 4. Rank-reduce and vectorize.
   // ===========================================================================
-  %func_1 = transform.structured.match ops{["func.func"]} in %variant_op
+  %func_1 = transform.structured.match ops{["func.func"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %func_2 = transform.iree.apply_patterns %func_1 {  rank_reducing_linalg, rank_reducing_vector }
   %func_3 = transform.structured.vectorize %func_2
 
@@ -73,12 +73,12 @@
   // ===========================================================================
   %variant_op_2 = transform.iree.eliminate_empty_tensors %variant_op
   %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op_2
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   transform.iree.erase_hal_descriptor_type_from_memref %memref_func
 
   // Step 6. Post-bufferization mapping to blocks and threads.
   // ===========================================================================
-  %func_4 = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %func_4 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   %func_5 = transform.iree.foreach_thread_to_workgroup %func_4
   %func_6 = transform.iree.map_nested_foreach_thread_to_gpu_threads %func_5
       { workgroup_size = [32, 2, 1] }
@@ -86,7 +86,7 @@
   // Step 7. Post-bufferization vector distribution with rank-reduction.
   // ===========================================================================
   %func_7 = transform.iree.apply_patterns %func_6 { rank_reducing_linalg, rank_reducing_vector, fold_memref_aliases }
-  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
+  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0)
   // at this point.
   transform.sequence %variant_op_3 : !pdl.operation failures(suppress) {
diff --git a/tests/transform_dialect/cuda/reduction_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_codegen_spec.mlir
index 9f1f5ec..a29843d 100644
--- a/tests/transform_dialect/cuda/reduction_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/reduction_codegen_spec.mlir
@@ -2,11 +2,11 @@
 
 transform.structured.canonicalized_sequence failures(propagate) {
 ^bb1(%variant_op: !pdl.operation):
-  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op
+  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!pdl.operation) -> !pdl.operation
 
   // Step 1. Split the reduction to get meatier (size(red) / 2)-way parallelism.
   // ===========================================================================
-  %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op
+  %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %init_or_alloc_op, %more_parallel_fill_op, %more_parallel_op, %combiner_op =
     transform.structured.split_reduction %0
       { split_factor = 2, insert_split_dimension = 1 }
@@ -21,15 +21,15 @@
 
   // Step 3. Second level of tiling + fusion parallelizes to threads.
   // ===========================================================================
-  %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op
+  %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op : (!pdl.operation) -> !pdl.operation
   %foreach_thread_block_combiner_op, %block_combiner_op =
     transform.structured.tile_to_foreach_thread_op %grid_combiner_op tile_sizes [1] 
     ( mapping = [#gpu.thread<z>] )
   transform.structured.fuse_into_containing_op %fill_1d into %foreach_thread_block_combiner_op
 
-  %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op
+  %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op : (!pdl.operation) -> !pdl.operation
   %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]}
-    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op
+    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %foreach_thread_block_more_parallel_op, %block_more_parallel_op =
     transform.structured.tile_to_foreach_thread_op %grid_more_parallel_op tile_sizes [1, 1] 
     ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
@@ -37,7 +37,7 @@
 
   // Step 4. Rank-reduce and vectorize.
   // ===========================================================================
-  %func = transform.structured.match ops{["func.func"]} in %variant_op
+  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %func_2 = transform.iree.apply_patterns %func {  rank_reducing_linalg, rank_reducing_vector }
   %func_3 = transform.structured.vectorize %func_2
 
@@ -46,12 +46,12 @@
   %func_4 = transform.iree.apply_patterns %func_3 { fold_reassociative_reshapes }
   %variant_op_2 = transform.iree.eliminate_empty_tensors %variant_op
   %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op_2
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   transform.iree.erase_hal_descriptor_type_from_memref %memref_func
 
   // Step 6. Post-bufferization mapping to blocks and threads.
   // ===========================================================================
-  %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   %func_6 = transform.iree.foreach_thread_to_workgroup %func_5
   %func_7 = transform.iree.map_nested_foreach_thread_to_gpu_threads %func_6
       { workgroup_size = [32, 2, 1] }
@@ -59,7 +59,7 @@
   // Step 7. Post-bufferization vector distribution with rank-reduction.
   // ===========================================================================
   %func_8 = transform.iree.apply_patterns %func_7 { rank_reducing_linalg, rank_reducing_vector, fold_memref_aliases }
-  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
+  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0)
   // at this point.
   transform.sequence %variant_op_3 : !pdl.operation failures(suppress) {
diff --git a/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir
index b3df80b..69354fb 100644
--- a/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir
@@ -2,11 +2,11 @@
 
 transform.structured.canonicalized_sequence failures(propagate) {
 ^bb1(%variant_op: !pdl.operation):
-  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op
+  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   
   // Step 1. Split the reduction to get meatier (size(red) / 2)-way parallelism.
   // ===========================================================================
-  %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op
+  %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %reduction, %eltwise = transform.split_handles %0 in [2] : (!pdl.operation) -> (!pdl.operation, !pdl.operation)
   %init_or_alloc_op, %more_parallel_fill_op, %more_parallel_op, %combiner_op =
     transform.structured.split_reduction %reduction
@@ -22,18 +22,18 @@
 
   // Step 3. Second level of tiling + fusion parallelizes to threads.
   // ===========================================================================
-  %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op
+  %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op : (!pdl.operation) -> !pdl.operation
   %eltwise_block_loop, %eltwise_block_op =
     transform.structured.tile_to_foreach_thread_op %eltwise_grid_op tile_sizes [1]
     ( mapping = [#gpu.thread<z>] )
   %block_combiner_op = transform.structured.match ops{["linalg.generic"]}
-    attributes {iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op
+    attributes {iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %combined_and_fill = transform.merge_handles %fill_1d, %block_combiner_op : !pdl.operation
   transform.structured.fuse_into_containing_op %combined_and_fill into %eltwise_block_loop
 
-  %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op
+  %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op : (!pdl.operation) -> !pdl.operation
   %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]}
-    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op
+    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %foreach_thread_block_more_parallel_op, %block_more_parallel_op =
     transform.structured.tile_to_foreach_thread_op %grid_more_parallel_op tile_sizes [1, 1] 
     ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
@@ -41,7 +41,7 @@
 
   // Step 4. Rank-reduce and vectorize.
   // ===========================================================================
-  %func = transform.structured.match ops{["func.func"]} in %variant_op
+  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %func_2 = transform.iree.apply_patterns %func {  rank_reducing_linalg, rank_reducing_vector }
   %func_3 = transform.structured.vectorize %func_2
 
@@ -50,12 +50,12 @@
   %func_4 = transform.iree.apply_patterns %func_3 { fold_reassociative_reshapes }
   %variant_op_2 = transform.iree.eliminate_empty_tensors %variant_op
   %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op_2
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   transform.iree.erase_hal_descriptor_type_from_memref %memref_func
 
   // Step 6. Post-bufferization mapping to blocks and threads.
   // ===========================================================================
-  %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   %func_6 = transform.iree.foreach_thread_to_workgroup %func_5
   %func_7 = transform.iree.map_nested_foreach_thread_to_gpu_threads %func_6
       { workgroup_size = [32, 2, 1] }
@@ -63,7 +63,7 @@
   // Step 7. Post-bufferization vector distribution with rank-reduction.
   // ===========================================================================
   %func_8 = transform.iree.apply_patterns %func_7 { rank_reducing_linalg, rank_reducing_vector, fold_memref_aliases }
-  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
+  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0)
   // at this point.
   transform.sequence %variant_op_3 : !pdl.operation failures(suppress) {
diff --git a/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir
index 73b0e56..8224a2e 100644
--- a/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir
@@ -2,8 +2,8 @@
 
 transform.structured.canonicalized_sequence failures(propagate) {
 ^bb1(%variant_op: !pdl.operation):
-  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op
-  %reduction = transform.structured.match ops{["linalg.generic"]} in %variant_op
+  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!pdl.operation) -> !pdl.operation
+  %reduction = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!pdl.operation) -> !pdl.operation
 
   // Step 1. First level of tiling + fusion parallelizes to blocks.
   // ===========================================================================
@@ -34,7 +34,7 @@
 
   // Step 4. Rank-reduce and vectorize.
   // ===========================================================================
-  %func = transform.structured.match ops{["func.func"]} in %variant_op
+  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %func_2 = transform.iree.apply_patterns %func {  rank_reducing_linalg, rank_reducing_vector }
   %func_3 = transform.structured.vectorize %func_2
 
@@ -42,15 +42,15 @@
   // ===========================================================================
   %func_4 = transform.iree.apply_patterns %func_3 { fold_reassociative_reshapes }
   %variant_op_2 = transform.iree.eliminate_empty_tensors %variant_op
-  %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_2
+  %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_2 : (!pdl.operation) -> !pdl.operation
   %func_6 = transform.iree.apply_patterns %func_5 { erase_unnecessary_tensor_operands }
   %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op_2
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   transform.iree.erase_hal_descriptor_type_from_memref %memref_func
 
   // Step 6. Post-bufferization mapping to blocks and threads.
   // ===========================================================================
-  %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   %func_8 = transform.iree.foreach_thread_to_workgroup %func_7
   %func_9 = transform.iree.map_nested_foreach_thread_to_gpu_threads %func_8
       { workgroup_size = [32, 1, 1] }
@@ -58,7 +58,7 @@
   // Step 7. Post-bufferization vector distribution with rank-reduction.
   // ===========================================================================
   %func_10 = transform.iree.apply_patterns %func_9 { rank_reducing_linalg, rank_reducing_vector, fold_memref_aliases }
-  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
+  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
   transform.iree.vector.warp_distribute %func_10
 }
diff --git a/tests/transform_dialect/cuda/reduction_v3_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_v3_codegen_spec.mlir
index 6704bb8..867db8e 100644
--- a/tests/transform_dialect/cuda/reduction_v3_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/reduction_v3_codegen_spec.mlir
@@ -2,8 +2,8 @@
 
 transform.structured.canonicalized_sequence failures(propagate) {
 ^bb1(%variant_op: !pdl.operation):
-  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op
-  %reduction = transform.structured.match ops{["linalg.generic"]} in %variant_op
+  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!pdl.operation) -> !pdl.operation
+  %reduction = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!pdl.operation) -> !pdl.operation
 
   // Step 1. First level of tiling + fusion parallelizes to blocks.
   // ===========================================================================
@@ -31,7 +31,7 @@
 
   // Step 3. Rank-reduce and vectorize.
   // ===========================================================================
-  %func = transform.structured.match ops{["func.func"]} in %variant_op
+  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   // TODO: masked vectorization on block_more_parallel_op_2 if we want 
   // vector<4> to work as intended.
   %func_2 = transform.iree.apply_patterns %func {  rank_reducing_linalg, rank_reducing_vector }
@@ -41,15 +41,15 @@
   // ===========================================================================
   %func_4 = transform.iree.apply_patterns %func_3 { fold_reassociative_reshapes }
   %variant_op_2 = transform.iree.eliminate_empty_tensors %variant_op
-  %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_2
+  %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_2 : (!pdl.operation) -> !pdl.operation
   %func_6 = transform.iree.apply_patterns %func_5 { erase_unnecessary_tensor_operands }
   %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op_2
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   transform.iree.erase_hal_descriptor_type_from_memref %memref_func
 
   // Step 5. Post-bufferization mapping to blocks and threads.
   // ===========================================================================
-  %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   %func_8 = transform.iree.foreach_thread_to_workgroup %func_7
   %func_9 = transform.iree.map_nested_foreach_thread_to_gpu_threads %func_8
       { workgroup_size = [1024, 1, 1] }
@@ -57,7 +57,7 @@
   // Step 6. Post-bufferization vector distribution with rank-reduction.
   // ===========================================================================
   %func_10 = transform.iree.apply_patterns %func_9 { rank_reducing_linalg, rank_reducing_vector, fold_memref_aliases }
-  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
+  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
   transform.iree.vector.warp_distribute %func_10
 }
diff --git a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_codegen_spec.mlir
index 6c65303..9ace54f 100644
--- a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/softmax_codegen_spec.mlir
@@ -4,7 +4,7 @@
 transform.structured.canonicalized_sequence failures(propagate) {
 ^bb1(%variant_op: !pdl.operation):
   %ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
-    in %variant_op
+    in %variant_op : (!pdl.operation) -> !pdl.operation
   %input_max_fill,
   %input_max,
   %exps_sum_fill,
@@ -41,7 +41,7 @@
   // Step 2. Second level of tiling + fusion parallelizes to threads.
   // ================================================================
   %tiled_ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
-    in %variant_op
+    in %variant_op : (!pdl.operation) -> !pdl.operation
   %tiled_input_max_fill,
   %tiled_input_max,
   %tiled_exps_sum_fill,
@@ -70,7 +70,7 @@
 
   // Step 3. Rank-reduce and vectorize.
   // ==================================
-  %func = transform.structured.match ops{["func.func"]} in %variant_op
+  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %funcx = transform.iree.apply_patterns %func {  rank_reducing_linalg, rank_reducing_vector }
   transform.structured.vectorize %funcx
 
@@ -78,21 +78,21 @@
   // =========================================================
   %variant_op_2 = transform.iree.eliminate_empty_tensors %variant_op
   %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op_2
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   transform.iree.erase_hal_descriptor_type_from_memref %memref_func
 
   // Step 5. Post-bufferization mapping to blocks and threads.
   // =========================================================
-  %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   %func_3 = transform.iree.foreach_thread_to_workgroup %func_2
   transform.iree.map_nested_foreach_thread_to_gpu_threads %func_3
     { workgroup_size = [32, 4, 1] }
 
   // Step 6. Post-bufferization vector distribution with rank-reduction.
   // ===================================================================
-  %end_func = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %end_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   %end_func_2 = transform.iree.apply_patterns %end_func { rank_reducing_linalg, rank_reducing_vector, fold_memref_aliases }
-  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
+  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
   transform.iree.vector.warp_distribute %end_func_2
 }
diff --git a/tests/transform_dialect/cuda/softmax_dispatch_spec.mlir b/tests/transform_dialect/cuda/softmax_dispatch_spec.mlir
index 18282b8..efa5e5d 100644
--- a/tests/transform_dialect/cuda/softmax_dispatch_spec.mlir
+++ b/tests/transform_dialect/cuda/softmax_dispatch_spec.mlir
@@ -4,7 +4,7 @@
 transform.structured.canonicalized_sequence failures(propagate){
 ^bb1(%variant_op: !pdl.operation):
   %ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
-    in %variant_op
+    in %variant_op : (!pdl.operation) -> !pdl.operation
 
   %input_max_fill, %input_max, %exps_sum_fill, %exps, %exps_sum, %div =
     transform.split_handles %ops in [6]
@@ -20,7 +20,7 @@
     : !pdl.operation
   %region_op_2 = transform.iree.move_preceding_op_into_dispatch_region %non_div into %region_op
 
-  %empty = transform.structured.match ops{["tensor.empty"]} in %variant_op
+  %empty = transform.structured.match ops{["tensor.empty"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %region_op_3 = transform.iree.move_preceding_op_into_dispatch_region %empty into %region_op_2
   transform.iree.region_to_workgroups %region_op_3
 }
diff --git a/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir
index 4b9b92f..46a4fe9 100644
--- a/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir
@@ -7,10 +7,10 @@
   // Step 1. First level of tiling + fusion parallelizes to blocks.
   // ==============================================================
   %root = transform.structured.match interface{LinalgOp}
-    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]} in %variant_op
-  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op
+    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]} in %variant_op : (!pdl.operation) -> !pdl.operation
+  %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %red = transform.structured.match interface{LinalgOp}
-    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op
+    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %not_root = merge_handles %fill, %red : !pdl.operation
   %foreach_thread, %tiled_generic =
     transform.iree.tile_to_foreach_thread_and_workgroup_count_region %root tile_sizes [1, 4]
@@ -19,11 +19,11 @@
 
   // Step 2. Second level of tiling + fusion parallelizes to threads.
   // ================================================================
-  %fill_linalg = transform.structured.match ops{["linalg.fill"]} in %variant_op
+  %fill_linalg = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %reduction_linalg = transform.structured.match ops{["linalg.generic"]}
-    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op
+    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %parallel_linalg = transform.structured.match ops{["linalg.generic"]}
-    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]} in %variant_op
+    attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %foreach_thread_reduction, %tiled_reduction_generic =
     transform.structured.tile_to_foreach_thread_op %reduction_linalg tile_sizes [1, 1]
       ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
@@ -53,7 +53,7 @@
 
   // Step 3. Rank-reduce and vectorize.
   // ==================================
-  %func = transform.structured.match ops{["func.func"]} in %variant_op
+  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %funcx = transform.iree.apply_patterns %func {  rank_reducing_linalg, rank_reducing_vector }
   transform.structured.vectorize %funcx
 
@@ -61,21 +61,21 @@
   // =========================================================
   %variant_op_2 = transform.iree.eliminate_empty_tensors %variant_op
   %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op_2
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   transform.iree.erase_hal_descriptor_type_from_memref %memref_func
 
   // Step 5. Post-bufferization mapping to blocks and threads.
   // =========================================================
-  %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   %func_3 = transform.iree.foreach_thread_to_workgroup %func_2
   transform.iree.map_nested_foreach_thread_to_gpu_threads %func_3
     { workgroup_size = [32, 4, 1] }
 
   // Step 6. Post-bufferization vector distribution with rank-reduction.
   // ===================================================================
-  %end_func = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %end_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   %end_func_2 = transform.iree.apply_patterns %end_func { rank_reducing_linalg, rank_reducing_vector, fold_memref_aliases }
-  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
+  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
   transform.iree.vector.warp_distribute %end_func_2
 }
diff --git a/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir
index 9fe3573..8e44a16 100644
--- a/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir
@@ -4,7 +4,7 @@
 transform.structured.canonicalized_sequence failures(propagate) {
 ^bb1(%variant_op: !pdl.operation):
   %ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
-    in %variant_op
+    in %variant_op : (!pdl.operation) -> !pdl.operation
   %input_max_fill,
   %input_max,
   %exps_sum_fill,
@@ -34,7 +34,7 @@
   // Step 2. Second level of tiling + fusion parallelizes to threads.
   // ================================================================
   %tiled_ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
-    in %variant_op
+    in %variant_op : (!pdl.operation) -> !pdl.operation
   %tiled_input_max_fill,
   %tiled_input_max,
   %tiled_exps_sum_fill,
@@ -61,7 +61,7 @@
 
   // Step 3. Rank-reduce and vectorize.
   // ==================================
-  %funcx_2 = transform.structured.match ops{["func.func"]} in %variant_op
+  %funcx_2 = transform.structured.match ops{["func.func"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   %funcx_3 = transform.iree.apply_patterns %funcx_2 {  rank_reducing_linalg, rank_reducing_vector }
   transform.structured.vectorize %funcx_3
 
@@ -69,21 +69,21 @@
   // =========================================================
   %variant_op_2 = transform.iree.eliminate_empty_tensors %variant_op
   %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op_2
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   transform.iree.erase_hal_descriptor_type_from_memref %memref_func
 
   // Step 5. Post-bufferization mapping to blocks and threads.
   // =========================================================
-  %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   %func_3 = transform.iree.foreach_thread_to_workgroup %func_2
   transform.iree.map_nested_foreach_thread_to_gpu_threads %func_3
     { workgroup_size = [32, 4, 1] }
 
   // Step 6. Post-bufferization vector distribution with rank-reduction.
   // ===================================================================
-  %end_func = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %end_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   %end_func_2 = transform.iree.apply_patterns %end_func { rank_reducing_linalg, rank_reducing_vector, fold_memref_aliases }
-  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
+  %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
   transform.iree.vector.warp_distribute %end_func_2
 }
diff --git a/tests/transform_dialect/cuda/vecadd2d_codegen_spec.mlir b/tests/transform_dialect/cuda/vecadd2d_codegen_spec.mlir
index b193678..f55dac8 100644
--- a/tests/transform_dialect/cuda/vecadd2d_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/vecadd2d_codegen_spec.mlir
@@ -2,21 +2,21 @@
 ^bb1(%variant_op: !pdl.operation):
   // Step 1. Find three linalg.generics and tile to GPU thread blocks.
   // ===========================================================================
-  %generics = transform.structured.match ops{["linalg.generic"]} in %variant_op
+  %generics = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   transform.iree.tile_to_foreach_thread_and_workgroup_count_region %generics 
                   tile_sizes [5, 3] ( mapping = [#gpu.block<z>, #gpu.block<x>])
 
   // Step 2. Rank reduce and bufferize and drop HAL descriptor from memref ops.
   // ===========================================================================
-  %func = transform.structured.match ops{["func.func"]} in %variant_op
+  %func = transform.structured.match ops{["func.func"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   transform.iree.apply_patterns %func {  rank_reducing_linalg, rank_reducing_vector }
   %variant_op_2 = transform.iree.eliminate_empty_tensors %variant_op
   %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op_2
-  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   transform.iree.erase_hal_descriptor_type_from_memref %memref_func
 
   // Step 3. Map to GPU thread blocks.
   // ===========================================================================
-  %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3
+  %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!pdl.operation) -> !pdl.operation
   transform.iree.foreach_thread_to_workgroup %func_2
 }
diff --git a/tests/transform_dialect/cuda/vecadd2d_codegen_spec_partial_tile.mlir b/tests/transform_dialect/cuda/vecadd2d_codegen_spec_partial_tile.mlir
index 75e2dad..84836dd 100644
--- a/tests/transform_dialect/cuda/vecadd2d_codegen_spec_partial_tile.mlir
+++ b/tests/transform_dialect/cuda/vecadd2d_codegen_spec_partial_tile.mlir
@@ -1,6 +1,6 @@
 transform.structured.canonicalized_sequence failures(propagate) {
 ^bb1(%variant_op: !pdl.operation):
-  %generics = transform.structured.match ops{["linalg.generic"]} in %variant_op
+  %generics = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!pdl.operation) -> !pdl.operation
   // Tile only one dimension, skip the other one.
   transform.iree.tile_to_foreach_thread_and_workgroup_count_region %generics 
                   tile_sizes [0, 3] ( mapping = [#gpu.block<z>])
diff --git a/third_party/llvm-project b/third_party/llvm-project
index 603c286..db2aad4 160000
--- a/third_party/llvm-project
+++ b/third_party/llvm-project
@@ -1 +1 @@
-Subproject commit 603c286334b07f568d39f6706c848f576914f323
+Subproject commit db2aad4f41a2e90ecdbddd054aa752028f54d19c
diff --git a/third_party/mlir-hlo b/third_party/mlir-hlo
index fb7ef6b..b21bc81 160000
--- a/third_party/mlir-hlo
+++ b/third_party/mlir-hlo
@@ -1 +1 @@
-Subproject commit fb7ef6b8707687cdb993f121b53da656c02aface
+Subproject commit b21bc81642e5201349ff18830e9c8574c0646b16