Integrate llvm-project at e9c9ee9fe694067ee96643d05d6ac378349386bb (#8585)

* Integrate llvm-project at e9c9ee9fe694067ee96643d05d6ac378349386bb

* Reset third_party/llvm-project: e9c9ee9fe694067ee96643d05d6ac378349386bb (2022-03-15 21:51:12 +0000): [libc][NFC] Fix typos and redundant code triggering compiler warnings.

* Move MHLO and TF to matching commits

TF: 05f17fca35623f4ab6d275ed95f0e1363c939f73
MHLO: 57288f12595a2ee0488806672a42da59b1e56e13
Piper CL: 435187843

* Fixes for LLVM bump @5e8700ce8bf58bdf0a59eef99c85185a74177555

* Remove uses of `verifier`.

* Fix verification methods after the signature change of custom verify methods.

* Fix up fallout from bufferization changes

https://reviews.llvm.org/D121361
https://reviews.llvm.org/D121519

* Fix verifiers of Flow and VM ops.

* Fix lit test.

* Update iree-dialects in integrations.

Co-authored-by: Nicolas Vasilache <ntv@google.com>
Co-authored-by: Stella Laurenzo <stellaraccident@gmail.com>
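
For reference, the `verifier` removal follows the upstream TableGen migration: ops now declare `let hasVerifier = 1;` and implement a `verify()` member in C++ in place of the old free-standing `verify$cppClass` functions. A minimal sketch of the new hook (`MyOp` and its check are hypothetical):

```
// With `let hasVerifier = 1;` in the op definition, mlir-tblgen declares
// this member function on the generated op class.
LogicalResult MyOp::verify() {
  if (getNumOperands() == 0)
    return emitOpError("expected at least one operand");
  return success();
}
```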
diff --git a/integrations/tensorflow/WORKSPACE b/integrations/tensorflow/WORKSPACE
index 410fd2f..b279469 100644
--- a/integrations/tensorflow/WORKSPACE
+++ b/integrations/tensorflow/WORKSPACE
@@ -7,7 +7,7 @@
 
 load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
 
-TENSORFLOW_COMMIT = "fe3fd49d08db3174730123cbab2fed8bbec9cf1b"
+TENSORFLOW_COMMIT = "05f17fca35623f4ab6d275ed95f0e1363c939f73"
 
 git_repository(
     name = "org_tensorflow",
diff --git a/integrations/tensorflow/iree-dialects/BUILD b/integrations/tensorflow/iree-dialects/BUILD
index 2ce6051..350a209 100644
--- a/integrations/tensorflow/iree-dialects/BUILD
+++ b/integrations/tensorflow/iree-dialects/BUILD
@@ -32,7 +32,7 @@
     srcs = glob([
         "include/iree-dialects/Dialect/Input/*.td",
         "include/iree-dialects/Dialect/LinalgExt/IR/*.td",
-        "include/iree-dialects/Dialect/LinalgExt/Transforms/*.td",
+        "include/iree-dialects/Dialect/LinalgExt/Passes/*.td",
         "include/iree-dialects/Dialect/PyDM/IR/*.td",
         "include/iree-dialects/Dialect/PyDM/Transforms/*.td",
     ]),
@@ -43,7 +43,7 @@
     srcs = glob([
         "include/iree-dialects/Dialect/Input/*.td",
         "include/iree-dialects/Dialect/LinalgExt/IR/*.td",
-        "include/iree-dialects/Dialect/LinalgExt/Transforms/*.td",
+        "include/iree-dialects/Dialect/LinalgExt/Passes/*.td",
         "include/iree-dialects/Dialect/PyDM/IR/*.td",
         "python/iree/compiler/dialects/*.td",
     ]) + [
@@ -175,6 +175,8 @@
         ":TdFiles",
         "@llvm-project//mlir:CallInterfacesTdFiles",
         "@llvm-project//mlir:ControlFlowInterfacesTdFiles",
+        "@llvm-project//mlir:TilingInterfaceTdFiles",
+        "@llvm-project//mlir:ViewLikeInterfaceTdFiles",
     ],
 )
 
@@ -232,19 +234,19 @@
     tbl_outs = [
         (
             ["-gen-pass-decls"],
-            "include/iree-dialects/Dialect/LinalgExt/Transforms/Passes.h.inc",
+            "include/iree-dialects/Dialect/LinalgExt/Passes/Passes.h.inc",
         ),
         (
             ["-gen-pass-capi-header"],
-            "include/iree-dialects/Dialect/LinalgExt/Transforms/Passes.capi.h.inc",
+            "include/iree-dialects/Dialect/LinalgExt/Passes/Passes.capi.h.inc",
         ),
         (
             ["-gen-pass-capi-impl"],
-            "include/iree-dialects/Dialect/LinalgExt/Transforms/Passes.capi.cpp.inc",
+            "include/iree-dialects/Dialect/LinalgExt/Passes/Passes.capi.cpp.inc",
         ),
     ],
     tblgen = "@llvm-project//mlir:mlir-tblgen",
-    td_file = "include/iree-dialects/Dialect/LinalgExt/Transforms/Passes.td",
+    td_file = "include/iree-dialects/Dialect/LinalgExt/Passes/Passes.td",
     deps = [
         ":TdFiles",
         "@llvm-project//mlir:PassBaseTdFiles",
@@ -286,12 +288,12 @@
 )
 
 cc_library(
-    name = "IREELinalgExtTransforms",
+    name = "IREELinalgExtPasses",
     srcs = glob([
-        "lib/Dialect/LinalgExt/Transforms/*.cpp",
+        "lib/Dialect/LinalgExt/Passes/*.cpp",
     ]),
     hdrs = glob([
-        "include/iree-dialects/Dialect/LinalgExt/Transforms/*.h",
+        "include/iree-dialects/Dialect/LinalgExt/Passes/*.h",
     ]),
     deps = [
         ":IREEInputDialect",
@@ -502,6 +504,7 @@
     includes = ["include"],
     deps = [
         ":IREEInputDialect",
+        ":IREELinalgExtDialect",
         ":IREEPyDMDialect",
         ":IREEPyDMTransforms",
         "@llvm-project//mlir:CAPIIR",
@@ -523,11 +526,12 @@
     deps = [
         ":IREEInputDialect",
         ":IREELinalgExtDialect",
-        ":IREELinalgExtTransforms",
+        ":IREELinalgExtPasses",
         ":IREEPyDMDialect",
         ":IREEPyDMTransforms",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:ArithmeticDialect",
+        "@llvm-project//mlir:ControlFlowOps",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:LinalgOps",
diff --git a/integrations/tensorflow/iree-dialects/include/iree-dialects-c/Dialects.h b/integrations/tensorflow/iree-dialects/include/iree-dialects-c/Dialects.h
index 5b5d93d..eb6276b 100644
--- a/integrations/tensorflow/iree-dialects/include/iree-dialects-c/Dialects.h
+++ b/integrations/tensorflow/iree-dialects/include/iree-dialects-c/Dialects.h
@@ -21,6 +21,12 @@
 
 MLIR_DECLARE_CAPI_DIALECT_REGISTRATION(IREEInput, iree_input);
 
+//===--------------------------------------------------------------------===//
+// IREELinalgExt
+//===--------------------------------------------------------------------===//
+
+MLIR_DECLARE_CAPI_DIALECT_REGISTRATION(IREELinalgExt, iree_linalg_ext);
+
 //===----------------------------------------------------------------------===//
 // IREEPyDMDialect
 //===----------------------------------------------------------------------===//
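
A minimal sketch of consuming the new handle through the C API (context setup is illustrative; the accessor name is generated by the macro above):

```
// MLIR_DECLARE_CAPI_DIALECT_REGISTRATION(IREELinalgExt, iree_linalg_ext)
// declares mlirGetDialectHandle__iree_linalg_ext__().
MlirDialectHandle handle = mlirGetDialectHandle__iree_linalg_ext__();
mlirDialectHandleRegisterDialect(handle, ctx);
mlirDialectHandleLoadDialect(handle, ctx);
```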
diff --git a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/Input/InputBase.td b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/Input/InputBase.td
index 3990bd8..0d6565d 100644
--- a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/Input/InputBase.td
+++ b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/Input/InputBase.td
@@ -8,6 +8,7 @@
 #define IREE_DIALECTS_DIALECT_INPUT_BASE_TD
 
 include "mlir/IR/OpBase.td"
+include "mlir/IR/AttrTypeBase.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 
 def IREEInput_Dialect : Dialect {
diff --git a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/Input/InputDialect.td b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/Input/InputDialect.td
index cde0652..a60a1c6 100644
--- a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/Input/InputDialect.td
+++ b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/Input/InputDialect.td
@@ -61,17 +61,7 @@
 
   let parameters = (ins IREEInput_ElementTypeParameter:$elementType);
 
-  let printer = [{
-    $_printer << "<" << getElementType() << ">";
-  }];
-
-  let parser = [{
-    Type elementType;
-    if ($_parser.parseLess() || $_parser.parseType(elementType) ||
-        $_parser.parseGreater())
-      return Type();
-    return get($_ctxt, elementType);
-  }];
+  let hasCustomAssemblyFormat = 1;
 }
 
 def IREEInput_PtrType : IREEInput_Type<"Ptr"> {
@@ -80,17 +70,7 @@
   let summary = "Pointer to a concrete type";
   let parameters = (ins IREEInput_PtrTargetTypeParameter:$targetType);
 
-  let printer = [{
-    $_printer << "<" << getTargetType() << ">";
-  }];
-
-  let parser = [{
-    Type targetType;
-    if ($_parser.parseLess() || $_parser.parseType(targetType) ||
-        $_parser.parseGreater())
-      return Type();
-    return get($_ctxt, targetType);
-  }];
+  let hasCustomAssemblyFormat = 1;
 }
 
 #endif // IREE_DIALECTS_DIALECT_INPUT_DIALECT_TD
diff --git a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/CMakeLists.txt b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/CMakeLists.txt
index 9f57627..5a7289b 100644
--- a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/CMakeLists.txt
+++ b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/CMakeLists.txt
@@ -1,2 +1,2 @@
 add_subdirectory(IR)
-add_subdirectory(Transforms)
+add_subdirectory(Passes)
diff --git a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/IR/LinalgExtInterfaces.td b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/IR/LinalgExtInterfaces.td
index 638d4ed..4ae75cc 100644
--- a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/IR/LinalgExtInterfaces.td
+++ b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/IR/LinalgExtInterfaces.td
@@ -284,30 +284,6 @@
     >,
     InterfaceMethod<
       /*desc=*/[{
-        Return true if the payload uses the value loaded from `opOperand`. This
-        is useful to avoid loading from "write-only" memory that may be
-        uninitialized, as well as properly cloning "read-write" operands.
-      }],
-      /*retTy=*/"bool",
-      /*methodName=*/"payloadUsesValueFromOperand",
-      /*args=*/(ins "OpOperand *":$opOperand),
-      /*methodBody=*/"",
-      /*defaultImplementation=*/[{
-        unsigned bbArgNumber = opOperand->getOperandNumber();
-        // Safeguard against the named linalg ops that are manually defined and
-        // that only support buffer semantics: we should not be there.
-        // Such ops have an empty regionBuilder and are not constructed with a
-        // region for now. In the future they are slated to disappear.
-        assert(this->getOperation()->getNumRegions() == 1 && "unexpected "
-               "missing region (calling `payloadUsesValueFromOperand` on "
-               "manually defined named Linalg op?)");
-        Block &block = this->getOperation()->getRegion(0).front();
-        // Init tensors have uses.
-        return !block.getArgument(bbArgNumber).use_empty();
-      }]
-    >,
-    InterfaceMethod<
-      /*desc=*/[{
         Return true if `opOperand` is an input tensor.
       }],
       /*retTy=*/"bool",
@@ -340,21 +316,6 @@
     >,
     InterfaceMethod<
       /*desc=*/[{
-        Return true if `opOperand` is an init tensor. This is true when it is
-        an output tensor operand whose value is used in the payload region.
-      }],
-      /*retTy=*/"bool",
-      /*methodName=*/"isInitTensor",
-      /*args=*/(ins "OpOperand *":$opOperand),
-      /*methodBody=*/"",
-      /*defaultImplementation=*/[{
-        if (!$_op.isOutputTensor(opOperand))
-          return false;
-        return payloadUsesValueFromOperand(opOperand);
-      }]
-    >,
-    InterfaceMethod<
-      /*desc=*/[{
         Return the `opOperand` rank or zero for scalars.
       }],
       /*retTy=*/"int64_t",
diff --git a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.td b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.td
index 228c357..8b7bd97 100644
--- a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.td
+++ b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.td
@@ -10,8 +10,11 @@
 include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtBase.td"
 include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtInterfaces.td"
 include "iree-dialects/Dialect/LinalgExt/IR/TiledOpInterface.td"
-include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/Interfaces/ControlFlowInterfaces.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Interfaces/TilingInterface.td"
+include "mlir/Interfaces/ViewLikeInterface.td"
+
 
 //===----------------------------------------------------------------------===//
 // Base class.
@@ -28,9 +31,8 @@
          LinalgExtInterface,
          SingleBlockImplicitTerminator<"::mlir::iree_compiler::IREE::LinalgExt::YieldOp">
   ])> {
-  let verifier = [{ return verify$cppClass(*this); }];
-  let printer = [{ return print$cppClass(p, *this); }];
-  let parser = [{ return parse$cppClass(parser, result); }];
+  let hasVerifier = 1;
+  let hasCustomAssemblyFormat = 1;
   code extraLinalgExtOpClassDeclaration = [{
     SmallVector<Value> getDestinationOperands(OpBuilder &b) {
       SmallVector<Value> dest(outputs().begin(), outputs().end());
@@ -184,10 +186,7 @@
                               "getPartitionableLoops", "getTiledImplementation",
                               "generateScalarImplementation"
                             ]>,
-  DeclareOpInterfaceMethods<LinalgExtInterface,
-                            // FftOp does not have a region, so we have to
-                            // overwrite the method.
-                            ["payloadUsesValueFromOperand"]>
+  DeclareOpInterfaceMethods<LinalgExtInterface>
 ]> {
   let summary = "Fft operator";
   let description = [{
@@ -300,10 +299,7 @@
   DeclareOpInterfaceMethods<
       TiledOpInterface,
       ["generateScalarImplementation", "getTiledImplementation"]>,
-  DeclareOpInterfaceMethods<LinalgExtInterface,
-                            // ReverseOp does not have a region, so we have to
-                            // overwrite the method.
-                            ["payloadUsesValueFromOperand"]>]> {
+  DeclareOpInterfaceMethods<LinalgExtInterface>]> {
   let summary = "Reverse operator";
   let description = [{
     A temporary solution for lowering reverse ops into IREE, allowing IREE to
@@ -355,8 +351,8 @@
 def IREELinalgExt_YieldOp : IREELinalgExt_PureOp<"yield", [NoSideEffect, ReturnLike, Terminator]> {
   let summary = "LinalgExt yield op";
   let description = [{
-    `linalg_ext.yield` is a special terminator operation for blocks inside
-    regions in `linalg_ext` ops.
+    `iree_linalg_ext.yield` is a special terminator operation for blocks inside
+    regions in `iree_linalg_ext` ops.
   }];
 
   let arguments = (ins Variadic<AnyType>:$operands);
@@ -368,4 +364,268 @@
   let assemblyFormat = "attr-dict ($operands^ `:` type($operands))?";
 }
 
+//===----------------------------------------------------------------------===//
+// Ops supporting concurrency with tensors.
+//===----------------------------------------------------------------------===//
+
+def IREELinalgExt_TileOp : IREELinalgExt_PureOp<"tile",
+      [
+       // TODO: enable to allow hoisting, LICM and isDefinedOutside
+       // DeclareOpInterfaceMethods<LoopLikeOpInterface>,
+       SingleBlockImplicitTerminator<"::mlir::iree_compiler::IREE::LinalgExt::TileYieldOp">,
+       RecursiveSideEffects
+      ]> {
+  let summary = "tile operation";
+  let description = [{
+    `iree_linalg_ext.tile` is a 1-D loop construct that operates on tensors and
+    evaluates its body once for each tile. The number and size of tiles are
+    specified by the `tile_size` operand.
+
+    The `tile` op takes a list of destination-passing style tensors and returns
+    a matching list of tensors of the same size.
+
+    Every instance of the body is expected to return a tile with leading
+    dimension matching the corresponding tile size.
+
+    The default terminator behavior is such that tiles yielded by individual
+    iterations are concatenated along the `tiled_dim` dimension.
+    This is the canonical way to perform "subset insertions".
+    Note that if `tiled_dim` has the value `0`, it may be elided from
+    pretty printing and parsing.
+
+    All returned tiles are concatenated to form the matching full result
+    tensor, according to the terminator.
+
+    When the `tile_size` operand is a `tensor<..index>`, the `tile` op
+    evaluates its body `dim(tile_size, 0)` times. Each iteration `i` produces a
+    tile of leading size `tile_size[i]`.
+
+    The induced `offset` block argument captures the running sum of `tile_size`
+    for all the previous iterations.
+
+    When the `tile_size` operand is a single index, it is interpreted as a
+    sequence of tile sizes given by the following formula:
+    ```
+      N = tensor.dim(...)
+      S = tile_size
+      T, R = divmod(N, S)
+      [S] * T + ([R] if R != 0 else [])
+    ```
+
+    All tiles except the last are of the same size.
+  }];
+  let arguments = (ins AnyTypeOf<[// TODO: allow TensorOf<[Index]>,
+                                  Index]>:$tile_size,
+                       Variadic<AnyRankedTensor>:$outs,
+                       I64Attr:$tiled_dim);
+  let results = (outs Variadic<AnyType>:$results);
+  let regions = (region SizedRegion<1>:$region);
+  let skipDefaultBuilders = 1;
+  let builders = [
+    // Builder that builds a tile on the implicit first dimension (i.e. `0`).
+    OpBuilder<(ins "Value":$tileSizes, "ValueRange":$outs,
+      CArg<"function_ref<void(OpBuilder &, Location, Value, Value, ValueRange)>",
+           "nullptr">)>,
+    // Builder that builds a tile with a specified integral dimension.
+    OpBuilder<(ins "Value":$tileSizes, "ValueRange":$outs, "int64_t":$tiledDims,
+      CArg<"function_ref<void(OpBuilder &, Location, Value, Value, ValueRange)>",
+           "nullptr">)>,
+  ];
+
+  let extraClassDeclaration = [{
+    static StringRef getTiledDimAttrName() { return "tiled_dim"; }
+    using TileOpBodyBuilderFn =
+      function_ref<void(OpBuilder &, Location, Value /*offset*/, Value /*size*/,
+                        ValueRange /*outs*/)>;
+    // TODO: helper for getting named region args without magic constants etc.
+  }];
+
+  let hasCustomAssemblyFormat = 1;
+  let hasVerifier = 1;
+}
+
+def IREELinalgExt_TileYieldOp : IREELinalgExt_PureOp<"tile_yield", [
+    NoSideEffect, ReturnLike, Terminator]> {
+  let summary = "LinalgExt tile_yield op";
+  let description = [{
+    `iree_linalg_ext.tile_yield` is a special terminator operation for blocks inside
+    regions in `iree_linalg_ext.tile`.
+    The tiles yielded by individual iterations are concatenated along the first
+    dimension. This is the canonical way to perform "subset insertions"
+    (TODO: allow dim permutations).
+  }];
+
+  let arguments = (ins Variadic<AnyType>:$operands);
+
+  let builders = [
+    OpBuilder<(ins), [{ /* nothing to do */ }]>,
+  ];
+
+  let assemblyFormat = "attr-dict ($operands^ `:` type($operands))?";
+}
+
+def IREELinalgExt_InParallelOp : IREELinalgExt_PureOp<"in_parallel", [
+       SingleBlockImplicitTerminator<"::mlir::iree_compiler::IREE::LinalgExt::PerformConcurrentlyOp">,
+       RecursiveSideEffects,
+       AutomaticAllocationScope,
+      ]> {
+  let summary = "evaluate a block multiple times in parallel";
+  let description = [{
+    `iree_linalg_ext.in_parallel` is a target-independent parallel function application
+    operation. It has exactly one block that represents the parallel function body
+    and it takes a single index operand that indicates how many parallel instances
+    of that function should get instantiated.
+
+    When the parallel function body is pure (i.e. has no side effects) then the only
+    allowed terminator is `iree_linalg_ext.perform_concurrently`, which dictates
+    how the results of all parallel invocations should be reconciled into a full
+    value that will be returned from `in_parallel`. Multi-value returns are encoded
+    by including multiple operations inside the `perform_concurrently` block.
+
+    When the parallel function body has side effects, the order of reads and writes
+    to memory is unspecified across iterations.
+
+    This op resembles `scf.for` to a large degree, but crucially differs in that it
+    (1) doesn't have `iter_args` and (2) has a special terminator, both of which
+    enable reasoning about its parallel semantics. Another difference is that
+    `in_parallel` always iterates over a range between 0 and an upper bound, but
+    that's insignificant.
+  }];
+  let arguments = (ins Index:$num_threads);
+
+  let results = (outs Variadic<AnyType>:$results);
+  let regions = (region SizedRegion<1>:$region);
+
+  let hasCustomAssemblyFormat = 1;
+  let hasVerifier = 1;
+
+  // The default builder does not add the proper body BBargs, roll our own.
+  let skipDefaultBuilders = 1;
+  let builders = [
+    // Bodyless builder, result types must be specified.
+    OpBuilder<(ins "TypeRange":$resultTypes, "Value":$num_threads)>,
+    // Builder that takes a bodyBuilder lambda, result types are inferred from
+    // the terminator.
+    OpBuilder<(ins "Value":$num_threads,
+              "function_ref<void(OpBuilder &, Location, Value)>":$bodyBuilder)>
+  ];
+  let extraClassDeclaration = [{
+    Value getThreadIndex() { return getBody()->getArgument(0); }
+    static void ensureTerminator(Region &region, Builder &builder, Location loc);
+    PerformConcurrentlyOp getTerminator();
+  }];
+}
+
+def IREELinalgExt_PerformConcurrentlyOp : IREELinalgExt_PureOp<"perform_concurrently", [
+       NoSideEffect,
+       Terminator,
+       SingleBlockImplicitTerminator<"::mlir::iree_compiler::IREE::LinalgExt::EndPerformConcurrentlyOp">,
+      ]> {
+  let summary = "terminates a `in_parallel` block";
+  let description = [{
+    `iree_linalg_ext.perform_concurrently` is a designated terminator for the blocks
+    of `iree_linalg_ext.in_parallel` operations. The terminator contains a single block
+    itself, which describes how the results of each parallel invocation are to be
+    reconciled into a single value to be returned from the parallel invocation.
+    One operation in this terminator's block corresponds to a single return of
+    `in_parallel`.
+  }];
+
+  let regions = (region SizedRegion<1>:$region);
+
+  let hasCustomAssemblyFormat = 1;
+  let hasVerifier = 1;
+
+  // TODO(apaszke, ntv): Add an interface for ops that can appear inside
+  // perform_concurrently.
+  let extraClassDeclaration = [{
+    SmallVector<Type> yieldedTypes();
+    SmallVector<ParallelInsertSliceOp> yieldingOps();
+  }];
+}
+
+def IREELinalgExt_EndPerformConcurrentlyOp : IREELinalgExt_PureOp<"end_perform_concurrently", [
+       NoSideEffect, Terminator]> {
+  let summary = "terminates a `perform_concurrently` block";
+  let description = [{
+    A designated terminator for `perform_concurrently`. It's not expected to appear
+    in the textual form of the IR.
+  }];
+}
+
+def IREELinalgExt_ParallelInsertSliceOp : IREELinalgExt_PureOp<"parallel_insert_slice", [
+       AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> {
+  let summary = "updates slices of a tensor concurrently";
+  let description = [{
+    Updates slices of a full tensor with multiple sub-slices concurrently.
+
+    Conflicting writes result in undefined semantics, in that the indices written
+    to by multiple parallel updates might contain data from any of the updates, or
+    even a malformed bit pattern (in reality the semantics might end up depending
+    on the memory model of the parallel hardware that `in_parallel` will be lowered to).
+
+    If an index is updated by exactly one update, the value at that index in
+    the resulting tensor will be equal to the value at the corresponding index
+    of the slice that was used for the update. If an index is not updated at
+    all, its value will be equal to the one in the original tensor.
+
+    Note that we cannot mark this operation as pure (NoSideEffects), even
+    though it has no side effects, because it will get DCEd during
+    canonicalization. Ideally we would use attributes instead of those funny
+    terminating ops, but attributes cannot refer to SSA values at the moment, so
+    it's the best we can do for now.
+  }];
+
+  let arguments = (ins
+    AnyRankedTensor:$source,
+    AnyRankedTensor:$dest,
+    Variadic<Index>:$offsets,
+    Variadic<Index>:$sizes,
+    Variadic<Index>:$strides,
+    I64ArrayAttr:$static_offsets,
+    I64ArrayAttr:$static_sizes,
+    I64ArrayAttr:$static_strides
+  );
+  let assemblyFormat = [{
+    $source `into` $dest ``
+    custom<OperandsOrIntegersOffsetsOrStridesList>($offsets, $static_offsets)
+    custom<OperandsOrIntegersSizesList>($sizes, $static_sizes)
+    custom<OperandsOrIntegersOffsetsOrStridesList>($strides, $static_strides)
+    attr-dict `:` type($source) `into` type($dest)
+  }];
+
+  let extraClassDeclaration = [{
+    Type yieldedType() { return dest().getType(); }
+
+    RankedTensorType getSourceType() {
+      return source().getType().cast<RankedTensorType>();
+    }
+
+    /// Return the expected rank of each of the `static_offsets`, `static_sizes`
+    /// and `static_strides` attributes.
+    std::array<unsigned, 3> getArrayAttrMaxRanks() {
+      unsigned rank = getSourceType().getRank();
+      return {rank, rank, rank};
+    }
+
+    /// Return the number of leading operands before `offsets`, `sizes` and
+    /// `strides` operands.
+    static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }
+  }];
+
+  let builders = [
+    // Build a ParallelInsertSliceOp with mixed static and dynamic entries.
+    OpBuilder<(ins "Value":$source, "Value":$dest,
+      "ArrayRef<OpFoldResult>":$offsets, "ArrayRef<OpFoldResult>":$sizes,
+      "ArrayRef<OpFoldResult>":$strides,
+      CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,
+    // Build a ParallelInsertSliceOp with dynamic entries.
+    OpBuilder<(ins "Value":$source, "Value":$dest,
+      "ValueRange":$offsets, "ValueRange":$sizes, "ValueRange":$strides,
+      CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>
+  ];
+
+  let hasCanonicalizer = 1;
+}
+
 #endif  // IREE_DIALECT_LINALGEXT_OPS
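
To make the scalar `tile_size` formula above concrete, a worked sketch of the implied size sequence (plain C++, illustrative only; not part of this diff):

```
#include "llvm/ADT/SmallVector.h"

// For N = 10 and S = 4: T = 2 full tiles, R = 2 remainder, so the tile
// sizes are [4, 4, 2] and the induced `offset` values are [0, 4, 8].
llvm::SmallVector<int64_t> computeTileSizes(int64_t n, int64_t s) {
  llvm::SmallVector<int64_t> sizes(n / s, s);  // [S] * T
  if (int64_t r = n % s)                       // + [R] if R != 0
    sizes.push_back(r);
  return sizes;
}
```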
diff --git a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/LinalgExtBufferization.h b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/LinalgExtBufferization.h
new file mode 100644
index 0000000..c1b60b6
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/LinalgExtBufferization.h
@@ -0,0 +1,27 @@
+//===-- LinalgExtBufferization.h - Linalg Extension bufferization ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef IREE_DIALECTS_DIALECT_LINALGEXT_BUFFERIZATION_H_
+#define IREE_DIALECTS_DIALECT_LINALGEXT_BUFFERIZATION_H_
+
+namespace mlir {
+
+class DialectRegistry;
+
+namespace iree_compiler {
+namespace IREE {
+namespace LinalgExt {
+
+void registerBufferizableOpInterfaceExternalModels(DialectRegistry &registry);
+
+}  // namespace LinalgExt
+}  // namespace IREE
+}  // namespace iree_compiler
+}  // namespace mlir
+
+#endif  // IREE_DIALECTS_DIALECT_LINALGEXT_BUFFERIZATION_H_
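
A minimal sketch of wiring the external models into a context (standard `DialectRegistry` usage; the surrounding setup is illustrative):

```
mlir::DialectRegistry registry;
mlir::iree_compiler::IREE::LinalgExt::
    registerBufferizableOpInterfaceExternalModels(registry);
mlir::MLIRContext context(registry);
```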
diff --git a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/CMakeLists.txt b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/CMakeLists.txt
new file mode 100644
index 0000000..07379ca
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(LLVM_TARGET_DEFINITIONS Passes.td)
+mlir_tablegen(Passes.h.inc -gen-pass-decls)
+mlir_tablegen(Passes.capi.h.inc -gen-pass-capi-header)
+mlir_tablegen(Passes.capi.cpp.inc -gen-pass-capi-impl)
+add_public_tablegen_target(IREELinalgExtPassesIncGen)
diff --git a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/PassDetail.h b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/PassDetail.h
new file mode 100644
index 0000000..e5c044d
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/PassDetail.h
@@ -0,0 +1,19 @@
+#ifndef IREE_DIALECTS_DIALECT_LINALGEXT_TRANSFORMS_PASS_DETAIL_H_
+#define IREE_DIALECTS_DIALECT_LINALGEXT_TRANSFORMS_PASS_DETAIL_H_
+
+#include "mlir/Pass/Pass.h"
+
+namespace mlir {
+namespace iree_compiler {
+namespace IREE {
+namespace LinalgExt {
+
+#define GEN_PASS_CLASSES
+#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h.inc"  // IWYU pragma: keep
+
+}  // namespace LinalgExt
+}  // namespace IREE
+}  // namespace iree_compiler
+}  // namespace mlir
+
+#endif  // IREE_DIALECTS_DIALECT_LINALGEXT_TRANSFORMS_PASS_DETAIL_H_
diff --git a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/Passes.h b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/Passes.h
new file mode 100644
index 0000000..fb857f3
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/Passes.h
@@ -0,0 +1,32 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_DIALECTS_DIALECT_LINALGEXT_TRANSFORMS_PASSES_H_
+#define IREE_DIALECTS_DIALECT_LINALGEXT_TRANSFORMS_PASSES_H_
+
+#include "mlir/Pass/Pass.h"
+
+namespace mlir {
+namespace iree_compiler {
+namespace IREE {
+namespace LinalgExt {
+
+std::unique_ptr<OperationPass<FuncOp>> createTiledOpInterfaceTilingPass();
+
+std::unique_ptr<OperationPass<FuncOp>> createLinalgExtToLoopsPass();
+
+std::unique_ptr<OperationPass<>> createPadContractionToBlockSizePass();
+
+void registerTilingInterfaceExternalModels(DialectRegistry &registry);
+
+void registerPasses();
+
+}  // namespace LinalgExt
+}  // namespace IREE
+}  // namespace iree_compiler
+}  // namespace mlir
+
+#endif  // IREE_DIALECTS_DIALECT_LINALGEXT_TRANSFORMS_PASSES_H_
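
A sketch of running the relocated passes from C++ (pass-manager setup and the `module` value are illustrative; type spellings are as of this LLVM revision):

```
mlir::PassManager pm(&context);
pm.addNestedPass<mlir::FuncOp>(
    mlir::iree_compiler::IREE::LinalgExt::createLinalgExtToLoopsPass());
pm.addPass(mlir::iree_compiler::IREE::LinalgExt::
               createPadContractionToBlockSizePass());
mlir::LogicalResult result = pm.run(module);  // module: mlir::ModuleOp
```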
diff --git a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/Passes.td b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/Passes.td
new file mode 100644
index 0000000..54a0484
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/Passes.td
@@ -0,0 +1,47 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_DIALECT_LINALGEXT_PASSES
+#define IREE_DIALECT_LINALGEXT_PASSES
+
+include "mlir/Pass/PassBase.td"
+
+def LinalgExtToLoops :
+    Pass<"iree-linalg-ext-to-loops", "FuncOp"> {
+  let summary = "Convert LinalgExt ops to loops and Linalg ops.";
+  let constructor = "mlir::iree_compiler::IREE::LinalgExt::createLinalgExtToLoopsPass()";
+}
+
+def TiledOpInterfaceTiling :
+    Pass<"iree-linalg-ext-tile", "FuncOp"> {
+  let summary = "Test pass for tiling using TiledOpInterface";
+  let constructor = "mlir::iree_compiler::IREE::LinalgExt::createTiledOpInterfaceTilingPass()";
+}
+
+def PadContractionToBlockSize :
+    Pass<"iree-linalg-pad-contraction-to-block-size", ""> {
+  let summary = "Pads contraction (matmul) ops to next multiple of block size";
+  let description = [{
+    This pass will apply padding to any supported linalg contractions:
+      * Row-major matmul:
+          Padded to <rowAlignment x columnAlignment>
+
+    Both rowAlignment and columnAlignment must be power-of-two values. If an
+    op is already statically padded properly, no change will be made. However,
+    if dynamic dimensions exist, padding will be applied regardless. Because
+    of the dynamic case, applying this pass multiple times can result in
+    mutation on each run.
+  }];
+  let constructor = "mlir::iree_compiler::IREE::LinalgExt::createPadContractionToBlockSizePass()";
+  let options = [
+    Option<"rowAlignment", "rowAlignment", "int", /*default=*/"16",
+           "The row-wise output block size">,
+    Option<"columnAlignment", "columnAlignment", "int", /*default=*/"16",
+           "The column-wise output block size">,
+  ];
+}
+
+#endif  // IREE_DIALECT_LINALGEXT_PASSES
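
To make the alignment behavior concrete, a worked sketch of the padding arithmetic (`padToAlignment` is a hypothetical helper, not part of the pass):

```
// Round `dim` up to the next multiple of a power-of-two `alignment`.
int64_t padToAlignment(int64_t dim, int64_t alignment) {
  return (dim + alignment - 1) & ~(alignment - 1);
}
// With the default rowAlignment/columnAlignment of 16, a 100x60 matmul
// result pads to padToAlignment(100, 16) x padToAlignment(60, 16),
// i.e. 112x64; properly aligned static shapes are left unchanged.
```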
diff --git a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/Transforms.h b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/Transforms.h
new file mode 100644
index 0000000..6fa1f51
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/Transforms.h
@@ -0,0 +1,93 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_DIALECTS_DIALECT_LINALGEXT_PASSES_TRANSFORMS_H_
+#define IREE_DIALECTS_DIALECT_LINALGEXT_PASSES_TRANSFORMS_H_
+
+#include "iree-dialects/Dialect/LinalgExt/IR/TiledOpInterface.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/Linalg/Utils/Utils.h"
+
+namespace mlir {
+namespace iree_compiler {
+namespace IREE {
+namespace LinalgExt {
+
+/// Structure to represent the result of a tiling operation.
+struct TiledOp {
+  /// Tiled op.
+  Operation *op;
+  /// Loops generated during tiling.
+  SmallVector<Operation *> loops;
+  /// Values that are replacements for the untiled operations.
+  SmallVector<Value> results;
+};
+
+/// Main entry point for tiling LinalgExtOps using TiledOpInterface.
+FailureOr<TiledOp> tileLinalgExtOp(OpBuilder &b, TiledOpInterface tilableOp,
+                                   const linalg::LinalgTilingOptions &options);
+
+/// Base rewrite pattern to tile and distribute operations that implement
+/// the `TiledOpInterface`. Concrete patterns such as
+/// `TiledOpInterfaceTilingPattern` below build on `matchAndRewriteBase`.
+struct TiledOpInterfaceBaseTilingPattern
+    : public OpInterfaceRewritePattern<TiledOpInterface> {
+  TiledOpInterfaceBaseTilingPattern(MLIRContext *context,
+                                    linalg::LinalgTilingOptions options,
+                                    linalg::LinalgTransformationFilter filter =
+                                        linalg::LinalgTransformationFilter(),
+                                    PatternBenefit benefit = 1)
+      : OpInterfaceRewritePattern(context, benefit),
+        filter(filter),
+        options(options) {}
+
+  LogicalResult matchAndRewriteBase(TiledOpInterface tilableOp,
+                                    PatternRewriter &rewriter,
+                                    TiledOp &result) const;
+
+ private:
+  /// LinalgTransformMarker handles special attribute manipulations.
+  linalg::LinalgTransformationFilter filter;
+  /// Options to control tiling.
+  linalg::LinalgTilingOptions options;
+};
+
+struct TiledOpInterfaceTilingPattern
+    : public TiledOpInterfaceBaseTilingPattern {
+  TiledOpInterfaceTilingPattern(MLIRContext *context,
+                                linalg::LinalgTilingOptions options,
+                                linalg::LinalgTransformationFilter filter =
+                                    linalg::LinalgTransformationFilter(),
+                                PatternBenefit benefit = 1)
+      : TiledOpInterfaceBaseTilingPattern(context, options, filter, benefit) {}
+
+  LogicalResult matchAndRewrite(TiledOpInterface tilableOp,
+                                PatternRewriter &rewriter) const override {
+    TiledOp tiledOp;
+    // Check for failure.
+    if (failed(TiledOpInterfaceBaseTilingPattern::matchAndRewriteBase(
+            tilableOp, rewriter, tiledOp))) {
+      return failure();
+    }
+    // Check for do-nothing case.
+    if (!tiledOp.op) return failure();
+    if (tiledOp.op != tilableOp) {
+      if (tiledOp.results.empty()) {
+        rewriter.eraseOp(tilableOp);
+      } else {
+        rewriter.replaceOp(tilableOp, tiledOp.results);
+      }
+    }
+    return success();
+  }
+};
+
+}  // namespace LinalgExt
+}  // namespace IREE
+}  // namespace iree_compiler
+}  // namespace mlir
+
+#endif  // IREE_DIALECTS_DIALECT_LINALGEXT_PASSES_TRANSFORMS_H_
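
A sketch of driving the base pattern through its concrete subclass (tile sizes and the greedy driver are illustrative):

```
mlir::RewritePatternSet patterns(ctx);
patterns.add<
    mlir::iree_compiler::IREE::LinalgExt::TiledOpInterfaceTilingPattern>(
    ctx, mlir::linalg::LinalgTilingOptions().setTileSizes({16, 32}));
(void)mlir::applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
```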
diff --git a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Transforms/Transforms.h b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Transforms/Transforms.h
index 6fa1f51..3099515 100644
--- a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Transforms/Transforms.h
+++ b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Transforms/Transforms.h
@@ -1,87 +1,93 @@
-// Copyright 2021 The IREE Authors
+//===- Transforms.h - LinalgExt transformations as patterns -----*- C++ -*-===//
 //
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
 
 #ifndef IREE_DIALECTS_DIALECT_LINALGEXT_TRANSFORMS_TRANSFORMS_H_
 #define IREE_DIALECTS_DIALECT_LINALGEXT_TRANSFORMS_TRANSFORMS_H_
 
-#include "iree-dialects/Dialect/LinalgExt/IR/TiledOpInterface.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
-#include "mlir/Dialect/Linalg/Utils/Utils.h"
+#include "mlir/IR/PatternMatch.h"
 
 namespace mlir {
+namespace scf {
+class ForOp;
+}
+
 namespace iree_compiler {
 namespace IREE {
 namespace LinalgExt {
 
-/// Structure to represent the result of tiling operation.
-struct TiledOp {
-  /// Tiled op.
-  Operation *op;
-  /// Loops generated during tiling.
-  SmallVector<Operation *> loops;
-  /// Values that are replacements for the untiled operations.
-  SmallVector<Value> results;
-};
+/// Pattern to tile a TilingInterface op using a TileOp.
+struct LinalgExtTilingPattern
+    : public OpInterfaceRewritePattern<TilingInterface> {
+  LinalgExtTilingPattern(MLIRContext *context, linalg::LinalgTilingOptions opt)
+      : OpInterfaceRewritePattern<TilingInterface>(context), options(opt) {}
 
-/// Main entry point for tiling LinalgExtOps using TiledOpInterface.
-FailureOr<TiledOp> tileLinalgExtOp(OpBuilder &b, TiledOpInterface tilableOp,
-                                   const linalg::LinalgTilingOptions &options);
+  FailureOr<Operation *> returningMatchAndRewrite(
+      TilingInterface op, PatternRewriter &rewriter) const;
 
-/// Base rewrite pattern to tile and distribute operations that implement the
-/// `TiledOpInterface`.
-/// Base pattern for tiling TiledOpInterfaceOps.
-struct TiledOpInterfaceBaseTilingPattern
-    : public OpInterfaceRewritePattern<TiledOpInterface> {
-  TiledOpInterfaceBaseTilingPattern(MLIRContext *context,
-                                    linalg::LinalgTilingOptions options,
-                                    linalg::LinalgTransformationFilter filter =
-                                        linalg::LinalgTransformationFilter(),
-                                    PatternBenefit benefit = 1)
-      : OpInterfaceRewritePattern(context, benefit),
-        filter(filter),
-        options(options) {}
-
-  LogicalResult matchAndRewriteBase(TiledOpInterface tilableOp,
-                                    PatternRewriter &rewriter,
-                                    TiledOp &result) const;
+  LogicalResult matchAndRewrite(TilingInterface op,
+                                PatternRewriter &rewriter) const override {
+    return returningMatchAndRewrite(op, rewriter);
+  }
 
  private:
-  /// LinalgTransformMarker handles special attribute manipulations.
-  linalg::LinalgTransformationFilter filter;
-  /// Options to control tiling;
   linalg::LinalgTilingOptions options;
 };
 
-struct TiledOpInterfaceTilingPattern
-    : public TiledOpInterfaceBaseTilingPattern {
-  TiledOpInterfaceTilingPattern(MLIRContext *context,
-                                linalg::LinalgTilingOptions options,
-                                linalg::LinalgTransformationFilter filter =
-                                    linalg::LinalgTransformationFilter(),
-                                PatternBenefit benefit = 1)
-      : TiledOpInterfaceBaseTilingPattern(context, options, filter, benefit) {}
+/// Pattern to rewrite a TileOp to an scf::ForOp.
+struct TileOpToSCFRewriter : public OpRewritePattern<TileOp> {
+  using OpRewritePattern::OpRewritePattern;
 
-  LogicalResult matchAndRewrite(TiledOpInterface tilableOp,
+  FailureOr<scf::ForOp> returningMatchAndRewrite(
+      TileOp tileOp, PatternRewriter &rewriter) const;
+
+  LogicalResult matchAndRewrite(TileOp tileOp,
                                 PatternRewriter &rewriter) const override {
-    TiledOp tiledOp;
-    // Check for failure.
-    if (failed(TiledOpInterfaceBaseTilingPattern::matchAndRewriteBase(
-            tilableOp, rewriter, tiledOp))) {
-      return failure();
-    }
-    // Check for do-nothing case.
-    if (!tiledOp.op) return failure();
-    if (tiledOp.op != tilableOp) {
-      if (tiledOp.results.empty()) {
-        rewriter.eraseOp(tilableOp);
-      } else {
-        rewriter.replaceOp(tilableOp, tiledOp.results);
-      }
-    }
-    return success();
+    return returningMatchAndRewrite(tileOp, rewriter);
+  }
+};
+
+/// Pattern to rewrite a TileOp to an InParallelOp.
+struct TileOpToInParallelRewriter : public OpRewritePattern<TileOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  FailureOr<InParallelOp> returningMatchAndRewrite(
+      TileOp tileOp, PatternRewriter &rewriter) const;
+
+  LogicalResult matchAndRewrite(TileOp tileOp,
+                                PatternRewriter &rewriter) const override {
+    return returningMatchAndRewrite(tileOp, rewriter);
+  }
+};
+
+/// Pattern to rewrite an InParallelOp to the async dialect.
+struct InParallelOpToAsyncRewriter : public OpRewritePattern<InParallelOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  FailureOr<Operation *> returningMatchAndRewrite(
+      InParallelOp inParallelOp, PatternRewriter &rewriter) const;
+
+  LogicalResult matchAndRewrite(InParallelOp inParallelOp,
+                                PatternRewriter &rewriter) const override {
+    return returningMatchAndRewrite(inParallelOp, rewriter);
+  }
+};
+
+/// Pattern to rewrite an InParallelOp to an scf::ForOp.
+struct InParallelOpToScfForRewriter : public OpRewritePattern<InParallelOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  FailureOr<scf::ForOp> returningMatchAndRewrite(
+      InParallelOp inParallelOp, PatternRewriter &rewriter) const;
+
+  LogicalResult matchAndRewrite(InParallelOp inParallelOp,
+                                PatternRewriter &rewriter) const override {
+    return returningMatchAndRewrite(inParallelOp, rewriter);
   }
 };
 
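
A sketch composing the rewritten patterns, e.g. tiling to `iree_linalg_ext.tile` and then lowering that loop construct to `scf.for` (the single greedy driver run is illustrative):

```
mlir::RewritePatternSet patterns(ctx);
patterns.add<mlir::iree_compiler::IREE::LinalgExt::LinalgExtTilingPattern>(
    ctx, mlir::linalg::LinalgTilingOptions().setTileSizes({8}));
patterns.add<mlir::iree_compiler::IREE::LinalgExt::TileOpToSCFRewriter>(ctx);
(void)mlir::applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
```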
diff --git a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Transforms/Utils.h b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Transforms/Utils.h
new file mode 100644
index 0000000..534e794
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Transforms/Utils.h
@@ -0,0 +1,124 @@
+//===- Utils.h - Utils for simplifying writing transformations -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+
+#ifndef IREE_DIALECTS_DIALECT_LINALGEXT_TRANSFORMS_UTILS_H_
+#define IREE_DIALECTS_DIALECT_LINALGEXT_TRANSFORMS_UTILS_H_
+
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/Support/LLVM.h"
+
+namespace mlir {
+class Location;
+class OpBuilder;
+class Operation;
+class Value;
+
+namespace tensor {
+class ExtractSliceOp;
+}
+
+namespace iree_compiler {
+namespace IREE {
+namespace LinalgExt {
+
+/// Helper function which auto-completes the missing trailing dimensions to
+/// always be offset = 0, size = dim, stride = 1.
+void completeOffsetsSizesAndStrides(OpBuilder &b, Location loc, Value tensor,
+                                    ArrayRef<Value> leadingOffsets,
+                                    ArrayRef<Value> leadingSizes,
+                                    ArrayRef<Value> leadingStrides,
+                                    SmallVectorImpl<Value> &offsets,
+                                    SmallVectorImpl<Value> &sizes,
+                                    SmallVectorImpl<Value> &strides);
+
+/// Create a tensor::ExtractSliceOp by auto-completing the missing trailing
+/// dimensions to always be offset = 0, size = dim, stride = 1.
+Value createSubsetExtractOpFromLeadingOffsetsSizesAndStrides(
+    OpBuilder &b, Location loc, Value tensor,
+    llvm::ArrayRef<Value> leadingOffsets, ArrayRef<Value> leadingSizes,
+    ArrayRef<Value> leadingStrides);
+
+/// Create a tensor::InsertSliceOp by auto-completing the missing trailing
+/// dimensions to always be offset = 0, size = dim, stride = 1.
+Value createSubsetInsertOpFromLeadingOffsetsSizesAndStrides(
+    OpBuilder &b, Location loc, Value tensor, Value dest,
+    ArrayRef<Value> leadingOffsets, ArrayRef<Value> leadingSizes,
+    ArrayRef<Value> leadingStrides);
+
+/// Create a linalg_ext::ParallelInsertSliceOp by auto-completing the missing
+/// trailing dimensions to always be offset = 0, size = dim, stride = 1.
+Operation *createParallelInsertSliceOpFromLeadingOffsetsSizesAndStrides(
+    OpBuilder &b, Location loc, Value tensor, Value dest,
+    ArrayRef<Value> leadingOffsets, ArrayRef<Value> leadingSizes,
+    ArrayRef<Value> leadingStrides);
+
+/// Insert the `source` tensor into the `dest` tensor by creating the relevant
+/// `subset_insert` op. The details of the `subset_insert` op are retrieved
+/// from the `subset_extract` op so that they form a matching extract/insert
+/// pair.
+Value createMatchingSubsetInsertOp(OpBuilder &b, Location loc,
+                                   tensor::ExtractSliceOp subsetExtractOp,
+                                   Value source, Value dest);
+
+struct AffineValueExpr {
+  explicit AffineValueExpr(AffineExpr e) : e(e) {}
+  AffineValueExpr bind(Value v) {
+    this->v = v;
+    return *this;
+  }
+  operator AffineExpr() const { return e; }
+  operator Value() const { return v; }
+  AffineExpr e;
+  Value v;
+};
+
+/// Helper struct to build simple arithmetic quantities from AffineValueExprs
+/// with minimal type inference support.
+// TODO: move into ArithBuilder once ops have been moved into arith.
+struct AffineBuilder {
+  AffineBuilder(OpBuilder &b, Location loc) : b(b), loc(loc) {}
+
+  Value add(AffineValueExpr lhs, AffineValueExpr rhs) {
+    return b.createOrFold<AffineApplyOp>(
+        loc, ArrayRef<AffineExpr>{lhs.e + rhs.e}, ValueRange{lhs, rhs});
+  }
+  Value sub(AffineValueExpr lhs, AffineValueExpr rhs) {
+    return b.createOrFold<AffineApplyOp>(
+        loc, ArrayRef<AffineExpr>{lhs.e - rhs.e}, ValueRange{lhs, rhs});
+  }
+  Value mul(AffineValueExpr lhs, AffineValueExpr rhs) {
+    return b.createOrFold<AffineApplyOp>(
+        loc, ArrayRef<AffineExpr>{lhs.e * rhs.e}, ValueRange{lhs, rhs});
+  }
+  Value ceil(AffineValueExpr lhs, AffineValueExpr rhs) {
+    return b.createOrFold<AffineApplyOp>(
+        loc, ArrayRef<AffineExpr>{lhs.e.ceilDiv(rhs.e)}, ValueRange{lhs, rhs});
+  }
+  Value min(ValueRange vals) {
+    return b.createOrFold<AffineMinOp>(
+        loc, AffineMap::getMultiDimIdentityMap(vals.size(), b.getContext()),
+        vals);
+  }
+  Value max(ValueRange vals) {
+    return b.createOrFold<AffineMinOp>(
+        loc, AffineMap::getMultiDimIdentityMap(vals.size(), b.getContext()),
+        vals);
+  }
+
+ private:
+  OpBuilder &b;
+  Location loc;
+};
+
+}  // namespace LinalgExt
+}  // namespace IREE
+}  // namespace iree_compiler
+}  // namespace mlir
+
+#endif  // IREE_DIALECTS_DIALECT_LINALGEXT_TRANSFORMS_UTILS_H_
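
A sketch of the `AffineBuilder` helpers in use (`b`, `loc`, and the values `x` and `y` are assumed to exist):

```
// Emits affine.apply affine_map<(d0, d1) -> (d0 ceildiv d1)>(x, y).
mlir::AffineExpr d0, d1;
mlir::bindDims(b.getContext(), d0, d1);
AffineBuilder ab(b, loc);
mlir::Value res = ab.ceil(AffineValueExpr(d0).bind(x),
                          AffineValueExpr(d1).bind(y));
```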
diff --git a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMBase.td b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMBase.td
index 4f20e1d..c1d53cb 100644
--- a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMBase.td
+++ b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMBase.td
@@ -8,6 +8,7 @@
 #define IREE_DIALECTS_DIALECT_PYDM_IR_PYDM_BASE_TD
 
 include "mlir/IR/OpBase.td"
+include "mlir/IR/AttrTypeBase.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 
 def IREEPyDM_Dialect : Dialect {
@@ -34,14 +35,12 @@
 }
 
 class IREEPyDM_Op<string mnemonic, list<Trait> traits = []> :
-    Op<IREEPyDM_Dialect, mnemonic, traits> {
-  let verifier = [{ return ::verify(*this); }];
-}
+    Op<IREEPyDM_Dialect, mnemonic, traits> {}
 
 class IREEPyDM_PureOp<string mnemonic, list<Trait> traits = []> :
-    Op<IREEPyDM_Dialect, mnemonic, !listconcat(traits, [NoSideEffect])> {
-  let verifier = [{ return ::verify(*this); }];
-}
-class IREEPyDM_TypeDef<string name, list<Trait> traits = []> : TypeDef<IREEPyDM_Dialect, name, traits>;
+    Op<IREEPyDM_Dialect, mnemonic, !listconcat(traits, [NoSideEffect])> {}
+
+class IREEPyDM_TypeDef<string name, list<Trait> traits = []> : 
+  TypeDef<IREEPyDM_Dialect, name, traits>;
 
 #endif // IREE_DIALECTS_DIALECT_PYDM_IR_PYDM_BASE_TD
diff --git a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMDialect.td b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMDialect.td
index 2a30fdb..ef6d862 100644
--- a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMDialect.td
+++ b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMDialect.td
@@ -150,51 +150,7 @@
     bool isSigned() const;
   }];
 
-  let printer = [{
-    auto w = getImpl()->bitWidth;
-    if (w) {
-      $_printer << "<";
-      if (*w == 0) {
-        $_printer << "*";
-      } else if (*w > 0) {
-        $_printer << *w;
-      } else {
-        $_printer << "unsigned " << (-*w);
-      }
-      $_printer << ">";
-    }
-  }];
-
-  let parser = [{
-    auto emitError = [&]() -> InFlightDiagnostic{
-      return $_parser.emitError($_parser.getCurrentLocation());
-    };
-    // Weak
-    if (failed($_parser.parseOptionalLess()))
-      return get($_ctxt);
-    // AP
-    if (succeeded($_parser.parseOptionalStar())) {
-      if (failed($_parser.parseGreater()))
-        return Type();
-      return get($_ctxt, None);
-    }
-
-    // Explicit
-    bool isSigned;
-    if (succeeded($_parser.parseOptionalKeyword("unsigned"))) {
-      isSigned = false;
-    } else {
-      isSigned = true;
-    }
-
-    int width;
-    if (failed($_parser.parseInteger(width)))
-      return Type();
-    if (failed($_parser.parseGreater()))
-      return Type();
-    if (!isSigned) width = -width;
-    return getChecked(emitError, $_ctxt, width);
-  }];
+  let hasCustomAssemblyFormat = 1;
 }
 
 def IREEPyDM_ListType : IREEPyDM_PrimitiveTypeDef<"List", ["isRefinable"]> {
@@ -216,59 +172,7 @@
       return Base::get($_ctxt, CollectionStorageClass::Boxed, nullptr);
     }]>
   ];
-
-  let printer = [{
-    if (getImpl()->uniformElementType ||
-        getImpl()->storageClass != CollectionStorageClass::Boxed) {
-      $_printer << "<";
-      switch (getImpl()->storageClass) {
-        case CollectionStorageClass::Boxed:
-          $_printer << "boxed";
-          break;
-        case CollectionStorageClass::Empty:
-          $_printer << "empty";
-          break;
-        case CollectionStorageClass::Unboxed:
-          $_printer << "unboxed";
-          break;
-      }
-
-      if (getImpl()->uniformElementType) {
-        $_printer << ",";
-        $_printer << getImpl()->uniformElementType;
-      }
-      $_printer << ">";
-    }
-  }];
-
-  let parser = [{
-    if (parser.parseOptionalLess())
-      return get($_ctxt, CollectionStorageClass::Boxed, nullptr);
-
-    Type t;
-    StringRef storageClassKeyword;
-    if ($_parser.parseKeyword(&storageClassKeyword))
-      return Type();
-    if ($_parser.parseComma())
-      return Type();
-    if ($_parser.parseType(t))
-      return Type();
-    if ($_parser.parseGreater())
-      return Type();
-
-    CollectionStorageClass storageClass;
-    if (storageClassKeyword == "boxed")
-      storageClass = CollectionStorageClass::Boxed;
-    else if (storageClassKeyword == "empty")
-      storageClass = CollectionStorageClass::Empty;
-    else if (storageClassKeyword == "unboxed")
-      storageClass = CollectionStorageClass::Unboxed;
-    else {
-      $_parser.emitError($_parser.getCurrentLocation(), "expected one of 'boxed', 'empty', 'unboxed'");
-      return Type();
-    }
-    return get($_ctxt, storageClass, t);
-  }];
+  let hasCustomAssemblyFormat = 1;
 
   let extraClassDeclaration = [{
     /// Gets the type used to store elements in the backing list.
@@ -330,28 +234,7 @@
     bool isWeak() const;
     bool isExplicit() const { return !isWeak(); }
   }];
-
-  let printer = [{
-    auto ft = getImpl()->floatType;
-    if (ft)
-      $_printer << "<" << ft << ">";
-  }];
-
-  let parser = [{
-    auto emitError = [&]() -> InFlightDiagnostic{
-      return $_parser.emitError($_parser.getCurrentLocation());
-    };
-    // Weak
-    if (failed($_parser.parseOptionalLess()))
-      return get($_ctxt);
-    // Explicit
-    FloatType subType;
-    if (failed($_parser.parseType(subType)))
-      return Type();
-    if (failed($_parser.parseGreater()))
-      return Type();
-    return getChecked(emitError, $_ctxt, subType);
-  }];
+  let hasCustomAssemblyFormat = 1;
 }
 
 def IREEPyDM_StrType : IREEPyDM_PrimitiveTypeDef<"Str"> {
@@ -424,29 +307,7 @@
       return Base::get($_ctxt, nullptr);
     }]>
   ];
-
-  let printer = [{
-    if (getImpl()->primitiveType)
-      $_printer << "<" << getImpl()->primitiveType << ">";
-  }];
-
-  let parser = [{
-    if (parser.parseOptionalLess())
-      return get($_ctxt, nullptr);
-
-    Type t;
-    if ($_parser.parseType(t))
-      return Type();
-    if ($_parser.parseGreater())
-      return Type();
-    if (auto primitiveType = t.dyn_cast<PrimitiveType>())
-      return get($_ctxt, primitiveType);
-    else {
-      $_parser.emitError(
-          $_parser.getNameLoc(), "expected a primitive type");
-      return Type();
-    }
-  }];
+  let hasCustomAssemblyFormat = 1;
 
   let extraClassDeclaration = [{
     static bool isGenericObjectType(Type t) {
@@ -479,27 +340,7 @@
   );
 
   let genVerifyDecl = 1;
-  let printer = [{
-    llvm::interleaveComma(getAlternatives(), $_printer);
-  }];
-
-  let parser = [{
-    if (parser.parseOptionalLess())
-      return get($_ctxt, {});
-
-    SmallVector<::mlir::Type> alternatives;
-
-    do {
-      Type type;
-      if ($_parser.parseType(type))
-        return Type();
-      alternatives.push_back(type);
-    } while (succeeded($_parser.parseOptionalComma()));
-
-    return getChecked([&]() {
-      return $_parser.emitError($_parser.getNameLoc());
-    }, $_ctxt, alternatives);
-  }];
+  let hasCustomAssemblyFormat = 1;
 }
 
 //===----------------------------------------------------------------------===//
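
The matching C++ for these PyDM types is not part of this diff; after the migration it takes the same shape as the `Input` dialect hooks further below. A hedged sketch for `ObjectType`, transcribing the removed inline parser and printer (the accessor name is assumed):

```
Type ObjectType::parse(AsmParser &parser) {
  if (parser.parseOptionalLess())
    return get(parser.getContext(), nullptr);
  Type t;
  if (parser.parseType(t) || parser.parseGreater())
    return Type();
  if (auto primitiveType = t.dyn_cast<PrimitiveType>())
    return get(parser.getContext(), primitiveType);
  parser.emitError(parser.getNameLoc(), "expected a primitive type");
  return Type();
}

void ObjectType::print(AsmPrinter &printer) const {
  if (Type t = getPrimitiveType())  // accessor name assumed
    printer << "<" << t << ">";
}
```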
diff --git a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMOps.td b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMOps.td
index f6dba20..bc5b181 100644
--- a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMOps.td
+++ b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMOps.td
@@ -41,6 +41,7 @@
     $lhs `[` $slice `]` `=` $rhs `:` type(operands) attr-dict
   }];
   let hasCanonicalizer = 1;
+  let hasVerifier = 0;
 }
 
 //===----------------------------------------------------------------------===//
@@ -182,8 +183,9 @@
     }]>
   ];
 
-  let parser = [{ return ::parseFuncOp(parser, result); }];
-  let printer = [{ return ::print(*this, p); }];
+  // TODO: Enforce invariants.
+  let hasVerifier = 0;
+  let hasCustomAssemblyFormat = 1;
 }
 
 def IREEPyDM_ReturnOp : IREEPyDM_Op<"return", [
@@ -477,6 +479,7 @@
   let assemblyFormat = [{
     ($elements^ `:` type($elements))? `->` type(results) attr-dict
   }];
+  let hasVerifier = 1;
 }
 
 def IREEPyDM_MakeTupleOp : IREEPyDM_PureOp<"make_tuple"> {
@@ -607,8 +610,8 @@
   let results = (outs Variadic<AnyType>:$results);
   let regions = (region SizedRegion<1>:$thenRegion, AnyRegion:$elseRegion);
 
-  let printer = [{ return ::print(p, *this); }];
-  let parser = [{ return ::parse$cppClass(parser, result); }];
+  let hasVerifier = 1;
+  let hasCustomAssemblyFormat = 1;
 }
 
 def YieldOp : IREEPyDM_Op<"yield", [NoSideEffect, ReturnLike, Terminator,
diff --git a/integrations/tensorflow/iree-dialects/lib/CAPI/CMakeLists.txt b/integrations/tensorflow/iree-dialects/lib/CAPI/CMakeLists.txt
index fde1221..5c0e24d 100644
--- a/integrations/tensorflow/iree-dialects/lib/CAPI/CMakeLists.txt
+++ b/integrations/tensorflow/iree-dialects/lib/CAPI/CMakeLists.txt
@@ -4,6 +4,7 @@
   LINK_LIBS PUBLIC
   MLIRIR
   IREEInputDialect
+  IREELinalgExtDialect
   IREEPyDMDialect
   IREEPyDMPasses
 )
diff --git a/integrations/tensorflow/iree-dialects/lib/CAPI/Dialects.cpp b/integrations/tensorflow/iree-dialects/lib/CAPI/Dialects.cpp
index ac169f1..569e530 100644
--- a/integrations/tensorflow/iree-dialects/lib/CAPI/Dialects.cpp
+++ b/integrations/tensorflow/iree-dialects/lib/CAPI/Dialects.cpp
@@ -7,6 +7,7 @@
 #include "iree-dialects-c/Dialects.h"
 
 #include "iree-dialects/Dialect/Input/InputDialect.h"
+#include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtDialect.h"
 #include "iree-dialects/Dialect/PyDM/IR/PyDMDialect.h"
 #include "iree-dialects/Dialect/PyDM/Transforms/Passes.h"
 #include "mlir/CAPI/IR.h"
@@ -27,6 +28,14 @@
 MLIR_DEFINE_CAPI_DIALECT_REGISTRATION(
     IREEInput, iree_input, mlir::iree_compiler::IREE::Input::IREEInputDialect)
 
+//===--------------------------------------------------------------------===//
+// IREELinalgExt
+//===--------------------------------------------------------------------===//
+
+MLIR_DEFINE_CAPI_DIALECT_REGISTRATION(
+    IREELinalgExt, iree_linalg_ext,
+    mlir::iree_compiler::IREE::LinalgExt::IREELinalgExtDialect)
+
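Note for reviewers: with this registration in place, C API clients can attach the dialect to a context the standard way. Assuming the usual macro expansion (the handle getter below is what MLIR_DEFINE_CAPI_DIALECT_REGISTRATION generates for the `iree_linalg_ext` namespace), usage looks like:

#include "iree-dialects-c/Dialects.h"
#include "mlir-c/IR.h"

// Sketch: register and load the LinalgExt dialect on an MlirContext.
void registerIreeLinalgExt(MlirContext context) {
  MlirDialectHandle handle = mlirGetDialectHandle__iree_linalg_ext__();
  mlirDialectHandleRegisterDialect(handle, context);
  mlirDialectHandleLoadDialect(handle, context);
}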
 //===----------------------------------------------------------------------===//
 // IREEPyDMDialect
 //===----------------------------------------------------------------------===//
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/Input/InputDialect.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/Input/InputDialect.cpp
index 060d308..a12a1b9 100644
--- a/integrations/tensorflow/iree-dialects/lib/Dialect/Input/InputDialect.cpp
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/Input/InputDialect.cpp
@@ -29,3 +29,41 @@
 #include "iree-dialects/Dialect/Input/InputOps.cpp.inc"
       >();
 }
+
+namespace mlir {
+namespace iree_compiler {
+namespace IREE {
+namespace Input {
+
+// ListType
+Type ListType::parse(AsmParser &parser) {
+  MLIRContext *ctxt = parser.getContext();
+  Type elementType;
+  if (parser.parseLess() || parser.parseType(elementType) ||
+      parser.parseGreater())
+    return Type();
+  return get(ctxt, elementType);
+}
+
+void ListType::print(AsmPrinter &printer) const {
+  printer << "<" << getElementType() << ">";
+}
+
+// PtrType
+Type PtrType::parse(AsmParser &parser) {
+  MLIRContext *ctxt = parser.getContext();
+  Type targetType;
+  if (parser.parseLess() || parser.parseType(targetType) ||
+      parser.parseGreater())
+    return Type();
+  return get(ctxt, targetType);
+}
+
+void PtrType::print(AsmPrinter &printer) const {
+  printer << "<" << getTargetType() << ">";
+}
+
+}  // namespace Input
+}  // namespace IREE
+}  // namespace iree_compiler
+}  // namespace mlir
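For reference, these hooks round-trip textual forms like the following (assuming the `iree_input` namespace and the `list`/`ptr` mnemonics declared in the .td file):

!iree_input.list<i32>
!iree_input.ptr<tensor<4xf32>>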
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/CMakeLists.txt b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/CMakeLists.txt
index 9f57627..126b878 100644
--- a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/CMakeLists.txt
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/CMakeLists.txt
@@ -1,2 +1,3 @@
 add_subdirectory(IR)
+add_subdirectory(Passes)
 add_subdirectory(Transforms)
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/IR/LinalgExtOps.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/IR/LinalgExtOps.cpp
index 57f9d86..af9ae07 100644
--- a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/IR/LinalgExtOps.cpp
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/IR/LinalgExtOps.cpp
@@ -24,6 +24,7 @@
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Diagnostics.h"
+#include "mlir/IR/FunctionImplementation.h"
 #include "mlir/IR/Matchers.h"
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/IR/OperationSupport.h"
@@ -103,48 +104,49 @@
 //===----------------------------------------------------------------------===//
 // ScatterOp
 //===----------------------------------------------------------------------===//
-static LogicalResult verifyScatterOp(ScatterOp op) {
-  if (op.inputs().size() != 2) {
-    return op.emitOpError("expected two input operands");
+LogicalResult ScatterOp::verify() {
+  Operation *op = getOperation();
+  if (inputs().size() != 2) {
+    return op->emitOpError("expected two input operands");
   }
-  if (op.outputs().size() != 1) {
-    return op.emitOpError("expected one output operand");
+  if (outputs().size() != 1) {
+    return op->emitOpError("expected one output operand");
   }
   auto checkDimensionsMatch = [&](ShapedType t1, ShapedType t2, unsigned dim) {
     return t1.getShape()[dim] == t2.getShape()[dim];
   };
 
-  auto indicesType = op.getIndicesType();
+  auto indicesType = getIndicesType();
   if (indicesType.getRank() != 2 ||
       !indicesType.getElementType().isInteger(32)) {
-    return op.emitOpError(
+    return op->emitOpError(
         "expected indices to be of rank 2 of i32 element type");
   }
-  auto indexDepth = op.getIndexDepth();
+  auto indexDepth = getIndexDepth();
   if (indexDepth == ShapedType::kDynamicSize) {
-    return op.emitOpError("expected index depth is static");
+    return op->emitOpError("expected index depth is static");
   }
 
   // The first dimension of the indices should match the first dimension of the
   // output. They indicate the number of updates.
-  auto updateType = op.getUpdateType();
+  auto updateType = getUpdateType();
   if (updateType.getRank() < 1) {
-    return op.emitOpError("expected update value to be at least rank 1");
+    return op->emitOpError("expected update value to be at least rank 1");
   }
   if (!checkDimensionsMatch(indicesType, updateType, 0)) {
-    return op.emitOpError(
+    return op->emitOpError(
         "mismatch in shape of indices and update value at dim#0");
   }
-  auto originalType = op.getOriginalType();
+  auto originalType = getOriginalType();
   if (updateType.getRank() - 1 > originalType.getRank()) {
-    return op.emitOpError(
+    return op->emitOpError(
         "update value rank exceeds the rank of the original value");
   }
 
   // indexDepth + update dims should cover the original dims. The first dim of
   // update is the number of updates.
   if (originalType.getRank() > indexDepth + updateType.getRank() - 1) {
-    return op.emitOpError(
+    return op->emitOpError(
         "index depth and update value does not cover rank of original value");
   }
 
@@ -159,7 +161,7 @@
     int64_t updateDim = std::get<1>(it);
     if (updateType.getDimSize(updateDim) !=
         originalType.getDimSize(originalDim)) {
-      return op.emitOpError("mismatch in shape of update value dim#")
+      return op->emitOpError("mismatch in shape of update value dim#")
              << updateDim << " and original value at dim#" << originalDim;
     }
   }
@@ -173,36 +175,36 @@
     int64_t updateDim = std::get<1>(it);
     if (updateType.getDimSize(updateDim) >
         originalType.getDimSize(originalDim)) {
-      return op.emitOpError("indexed shape of update value dim#")
+      return op->emitOpError("indexed shape of update value dim#")
              << updateDim << " exceeds original value at dim#" << originalDim
              << " " << updateType.getDimSize(updateDim) << " "
              << originalType.getDimSize(originalDim);
     }
   }
 
-  Region &region = op.region();
+  Region &region = this->region();
   Block *body = &region.front();
   if (body->getNumArguments() != 2) {
-    return op.emitOpError("expected region to have two arguments");
+    return op->emitOpError("expected region to have two arguments");
   }
   Type arg0Type = body->getArgument(0).getType();
   Type arg1Type = body->getArgument(1).getType();
   if (!arg0Type.isIntOrFloat() || !arg1Type.isIntOrFloat()) {
-    return op.emitOpError(
+    return op->emitOpError(
         "expected region to have scalar argument of integer or float types");
   }
   if (arg0Type != updateType.getElementType()) {
-    return op.emitOpError("mismatch in argument 0 of region ")
+    return op->emitOpError("mismatch in argument 0 of region ")
            << arg0Type << " and element type of update value "
            << updateType.getElementType();
   }
   if (arg1Type != originalType.getElementType()) {
-    return op.emitOpError("mismatch in argument 1 of region ")
+    return op->emitOpError("mismatch in argument 1 of region ")
            << arg1Type << " and element type of original value "
            << originalType.getElementType();
   }
   if (arg0Type != arg1Type) {
-    return op.emitOpError("mismatch in region argument types ")
+    return op->emitOpError("mismatch in region argument types ")
            << arg0Type << " and " << arg1Type;
   }
   auto yieldOp = cast<IREE::LinalgExt::YieldOp>(body->getTerminator());
@@ -353,44 +355,45 @@
 // SortOp
 //===----------------------------------------------------------------------===//
 
-static LogicalResult verifySortOp(SortOp op) {
-  if (op.getNumInputs()) {
-    return op.emitOpError("does not expect to take any inputs");
+LogicalResult SortOp::verify() {
+  Operation *op = getOperation();
+  if (getNumInputs()) {
+    return op->emitOpError("does not expect to take any inputs");
   }
-  if (op.getNumOutputs() == 0) {
-    return op.emitOpError("expected at least one `outs` operand");
+  if (getNumOutputs() == 0) {
+    return op->emitOpError("expected at least one `outs` operand");
   }
 
-  Block &block = op.region().front();
-  size_t numOutputs = op.getNumOutputs();
+  Block &block = region().front();
+  size_t numOutputs = getNumOutputs();
   if (block.getNumArguments() != 2 * numOutputs) {
-    return op.emitOpError("region block should have ")
+    return op->emitOpError("region block should have ")
            << 2 * numOutputs << " arguments";
   }
 
-  int64_t rank = op.getOperandRank();
-  int sortDim = op.dimension();
+  int64_t rank = getOperandRank();
+  int sortDim = dimension();
   if (sortDim < 0 || sortDim >= rank) {
-    return op.emitOpError("dimension must be within (0, ") << rank << "]";
+    return op->emitOpError("dimension must be within (0, ") << rank << "]";
   }
 
-  ArrayRef<int64_t> shape = op.getOperandShape();
-  for (auto indexedOperand : llvm::enumerate(op.outputs())) {
+  ArrayRef<int64_t> shape = getOperandShape();
+  for (auto indexedOperand : llvm::enumerate(outputs())) {
     int index = indexedOperand.index();
-    auto operandType = op.getOperandType(index);
+    auto operandType = getOperandType(index);
     if (operandType.getRank() != rank) {
-      return op.emitOpError("expected operand ")
+      return op->emitOpError("expected operand ")
              << index << " to be rank " << rank << ", same as other operands";
     }
     if (operandType.getShape() != shape) {
-      return op.emitOpError("expected operand ")
+      return op->emitOpError("expected operand ")
              << index << " to have same shape as other operands";
     }
     Type elemType = operandType.getElementType();
     for (int i : {2 * index, 2 * index + 1}) {
       Type argType = block.getArgument(i).getType();
       if (argType != elemType) {
-        return op.emitOpError("region block argument #")
+        return op->emitOpError("region block argument #")
                << i << " should be of type " << elemType << " but got "
                << argType;
       }
@@ -399,11 +402,11 @@
 
   auto yieldOp = cast<YieldOp>(block.getTerminator());
   if (yieldOp.getNumOperands() != 1) {
-    return op.emitOpError("should yield exactly one operand");
+    return op->emitOpError("should yield exactly one operand");
   }
   auto ty = yieldOp.getOperand(0).getType().dyn_cast<IntegerType>();
   if (!ty || ty.getWidth() != 1) {
-    return op.emitOpError("should yield i1 type");
+    return op->emitOpError("should yield i1 type");
   }
 
   return success();
@@ -559,26 +562,28 @@
 // FftOp
 //===----------------------------------------------------------------------===//
 
-static LogicalResult verifyFftOp(FftOp op) {
-  auto length = op.getFftLength();
+LogicalResult FftOp::verify() {
+  Operation *op = getOperation();
+  auto length = getFftLength();
   // After tiling, the shape could be dynamic, because subview/subtensor does
   // not infer the type correctly for (1 << x) cases.
   if (length == ShapedType::kDynamicSize) return success();
   if (length & (length - 1)) {
-    return op.emitOpError("only powers of 2 are handled currently");
+    return op->emitOpError("only powers of 2 are handled currently");
   }
-  if (!op.getNumInputs() || !op.isScalar(op.getInputOperand(0))) {
-    return op.emitOpError("expected to carry `stage` input");
+  if (!getNumInputs() || !isScalar(getInputOperand(0))) {
+    return op->emitOpError("expected to carry `stage` input");
   }
-  if (op.getNumInputs() != 1) {
-    if (op.getNumInputs() != 3 || op.isScalar(op.getInputOperand(1)) ||
-        op.isScalar(op.getInputOperand(2))) {
-      return op.emitOpError("expected to carry real and imag coeff inputs");
+  if (getNumInputs() != 1) {
+    if (getNumInputs() != 3 || isScalar(getInputOperand(1)) ||
+        isScalar(getInputOperand(2))) {
+      return op->emitOpError("expected to carry real and imag coeff inputs");
     }
   }
-  if (op.getNumOutputs() != 2) {
-    return op.emitOpError("expected outputs to be real and imag tensor/memref");
+  if (getNumOutputs() != 2) {
+    return op->emitOpError(
+        "expected outputs to be real and imag tensor/memref");
   }
   return success();
 }
@@ -758,8 +763,6 @@
   return success();
 }
 
-bool FftOp::payloadUsesValueFromOperand(OpOperand *) { return false; }
-
 SmallVector<unsigned> FftOp::getPartitionableLoops(
     unsigned maxNumParallelDims) {
   auto range = llvm::seq<unsigned>(0, getOperandRank());
@@ -811,34 +814,35 @@
 // ScanOp
 //===----------------------------------------------------------------------===//
 
-static LogicalResult verifyScanOp(ScanOp op) {
-  if (op.getNumInputs() != 1) {
-    return op.emitOpError("expected one input operands");
+LogicalResult ScanOp::verify() {
+  Operation *op = getOperation();
+  if (getNumInputs() != 1) {
+    return op->emitOpError("expected one input operands");
   }
-  if (op.getNumOutputs() != 2) {
-    return op.emitOpError("expected two output operands");
+  if (getNumOutputs() != 2) {
+    return op->emitOpError("expected two output operands");
   }
-  if (!op.input().getType().isa<ShapedType>()) {
-    return op.emitOpError("expected first input element type to be shaped");
+  if (!input().getType().isa<ShapedType>()) {
+    return op->emitOpError("expected first input element type to be shaped");
   }
-  auto accumulatorType = op.accumulator().getType().cast<ShapedType>();
-  auto inputType = op.input().getType().cast<ShapedType>();
-  auto outputType = op.output().getType().cast<ShapedType>();
+  auto accumulatorType = accumulator().getType().cast<ShapedType>();
+  auto inputType = input().getType().cast<ShapedType>();
+  auto outputType = output().getType().cast<ShapedType>();
   ArrayRef<int64_t> inputShapes = inputType.getShape();
   ArrayRef<int64_t> outputShapes = outputType.getShape();
   if (accumulatorType.getElementType() != inputType.getElementType()) {
-    return op.emitOpError(
+    return op->emitOpError(
         "expected input/accumulator element types to be identical");
   }
   ArrayRef<int64_t> accumulatorShape = accumulatorType.getShape();
   int64_t accumulatorRank = accumulatorType.getRank();
   if (accumulatorRank != inputType.getRank() - 1) {
-    return op.emitOpError(
+    return op->emitOpError(
         "expected accumulator rank to be equal to input rank - 1");
   }
   SmallVector<int64_t> expectedAccumulatorShape;
   for (int i = 0; i < inputType.getRank(); i++) {
-    if (i != op.dimension()) expectedAccumulatorShape.push_back(inputShapes[i]);
+    if (i != dimension()) expectedAccumulatorShape.push_back(inputShapes[i]);
   }
   if (llvm::any_of(llvm::zip(expectedAccumulatorShape, accumulatorShape),
                    [](std::tuple<int64_t, int64_t> s) {
@@ -846,14 +850,14 @@
                             std::get<1>(s) != ShapedType::kDynamicSize &&
                             std::get<0>(s) != std::get<1>(s);
                    })) {
-    return op.emitOpError("incompatible input/accumulator shapes");
+    return op->emitOpError("incompatible input/accumulator shapes");
   }
   if (inputType.getElementType() != outputType.getElementType()) {
-    return op.emitOpError(
+    return op->emitOpError(
         "expected input/output element types to be identical");
   }
   if (inputShapes.size() != outputShapes.size()) {
-    return op.emitOpError("expected input/output to have identical ranks");
+    return op->emitOpError("expected input/output to have identical ranks");
   }
   if (llvm::any_of(llvm::zip(inputShapes, outputShapes),
                    [](std::tuple<int64_t, int64_t> s) {
@@ -861,7 +865,7 @@
                             std::get<1>(s) != ShapedType::kDynamicSize &&
                             std::get<0>(s) != std::get<1>(s);
                    })) {
-    return op.emitOpError("incompatible input/output shapes");
+    return op->emitOpError("incompatible input/output shapes");
   }
   return success();
 }
@@ -1043,23 +1047,24 @@
 // ReverseOp
 //===----------------------------------------------------------------------===//
 
-static LogicalResult verifyReverseOp(ReverseOp op) {
-  if (op.getNumInputs() != 1) {
-    return op.emitOpError("expected exactly one input");
+LogicalResult ReverseOp::verify() {
+  Operation *op = getOperation();
+  if (getNumInputs() != 1) {
+    return op->emitOpError("expected exactly one input");
   }
-  if (op.getNumOutputs() != 1) {
-    return op.emitOpError("expected exactly one output");
+  if (getNumOutputs() != 1) {
+    return op->emitOpError("expected exactly one output");
   }
-  auto inputType = op.input().getType().cast<ShapedType>();
-  auto outputType = op.output().getType().cast<ShapedType>();
+  auto inputType = input().getType().cast<ShapedType>();
+  auto outputType = output().getType().cast<ShapedType>();
   if (inputType.getElementType() != outputType.getElementType()) {
-    return op.emitOpError(
+    return op->emitOpError(
         "expected input/output element types to be identical");
   }
   ArrayRef<int64_t> inputShapes = inputType.getShape();
   ArrayRef<int64_t> outputShapes = outputType.getShape();
   if (inputShapes.size() != outputShapes.size()) {
-    return op.emitOpError("expexted input/output to have identical ranks");
+    return op->emitOpError("expexted input/output to have identical ranks");
   }
   if (llvm::any_of(llvm::zip(inputShapes, outputShapes),
                    [](std::tuple<int64_t, int64_t> s) {
@@ -1067,18 +1072,18 @@
                             std::get<1>(s) != ShapedType::kDynamicSize &&
                             std::get<0>(s) != std::get<1>(s);
                    })) {
-    return op.emitOpError("incompatible input/output shapes");
+    return op->emitOpError("incompatible input/output shapes");
   }
 
-  int64_t rank = op.getOperandRank();
+  int64_t rank = getOperandRank();
   llvm::SmallSetVector<int64_t, 4> s;
-  for (auto dim : op.dims()) {
+  for (auto dim : dims()) {
     if (dim < 0 || dim >= rank) {
-      return op.emitOpError("all the dimensions must be within [0, ")
+      return op->emitOpError("all the dimensions must be within [0, ")
              << rank << ")";
     }
     if (s.contains(dim)) {
-      return op.emitOpError("expected dimensions numbers are all unique");
+      return op->emitOpError("expected dimensions numbers are all unique");
     }
     s.insert(dim);
   }
@@ -1086,8 +1091,6 @@
   return success();
 }
 
-bool ReverseOp::payloadUsesValueFromOperand(OpOperand *) { return false; }
-
 SmallVector<StringRef> ReverseOp::getLoopIteratorTypes() {
   SmallVector<StringRef> iteratorTypes(getOperandRank(),
                                        getParallelIteratorTypeName());
@@ -1246,6 +1249,388 @@
 }  // namespace
 
 //===----------------------------------------------------------------------===//
+// TileOp
+//===----------------------------------------------------------------------===//
+
+void TileOp::build(mlir::OpBuilder &builder, mlir::OperationState &result,
+                   Value tileSize, ValueRange outs, int64_t tiledDim,
+                   TileOp::TileOpBodyBuilderFn bodyBuilder) {
+  result.addOperands(tileSize);
+  result.addOperands(outs);
+  result.addAttribute(TileOp::getTiledDimAttrName(),
+                      builder.getI64IntegerAttr(tiledDim));
+  result.addTypes(outs.getType());
+
+  Region *bodyRegion = result.addRegion();
+  bodyRegion->push_back(new Block);
+  Block &bodyBlock = bodyRegion->front();
+  // TODO: Pass a better location here.
+  Location loc = tileSize.getLoc();
+  bodyBlock.addArgument(builder.getIndexType(), loc);
+  bodyBlock.addArgument(builder.getIndexType(), loc);
+  // Handle the sliced out types in a conservative fashion: all dimensions
+  // become dynamic and a later canonicalization is expected to recover static
+  // types.
+  // TODO: should we relax this and use something less strict?
+  auto dynamicTypes =
+      llvm::to_vector(llvm::map_range(outs.getTypes(), [](Type t) -> Type {
+        auto rankedTensorType = t.cast<RankedTensorType>();
+        RankedTensorType::Builder rttb(rankedTensorType);
+        SmallVector<int64_t> dynamicShape(rankedTensorType.getRank(),
+                                          ShapedType::kDynamicSize);
+        return rttb.setShape(dynamicShape);
+      }));
+  SmallVector<Location> locs(dynamicTypes.size(), loc);
+  bodyBlock.addArguments(dynamicTypes, locs);
+
+  OpBuilder::InsertionGuard guard(builder);
+  builder.setInsertionPointToStart(&bodyBlock);
+  bodyBuilder(builder, result.location, bodyBlock.getArgument(0),
+              bodyBlock.getArgument(1), bodyBlock.getArguments().drop_front(2));
+}
+
+void TileOp::build(mlir::OpBuilder &builder, mlir::OperationState &result,
+                   Value tileSize, ValueRange outs,
+                   TileOp::TileOpBodyBuilderFn bodyBuilder) {
+  TileOp::build(builder, result, tileSize, outs, 0, bodyBuilder);
+}
+
+// TODO(#81): Impl me.
+LogicalResult TileOp::verify() { return success(); }
+
+void TileOp::print(OpAsmPrinter &p) {
+  p << ' ' << tile_size() << ' ';
+  if (tiled_dim() > 0) p << "tiled_dim = " << tiled_dim() << ' ';
+  if (!outs().empty()) {
+    p << "outs(";
+    llvm::interleaveComma(outs(), p,
+                          [&p](Value v) { p << v << ": " << v.getType(); });
+    p << ')';
+  }
+  p << " -> (" << getResultTypes() << ") ";
+  p.printRegion(region(),
+                /*printEntryBlockArgs=*/true,
+                /*printBlockTerminators=*/true);
+  p.printOptionalAttrDict(getOperation()->getAttrs(),
+                          /*elidedAttrs=*/{TileOp::getTiledDimAttrName()});
+}
+
+ParseResult TileOp::parse(OpAsmParser &parser, OperationState &result) {
+  auto &builder = parser.getBuilder();
+
+  OpAsmParser::OperandType tileSizes;
+  // TODO: also allow tensor<..xindex> and figure out a good syntax.
+  // Type tensorOfIndexType =
+  //     RankedTensorType::get({ShapedType::kDynamicSize}, indexType);
+  Type tileSizesType = builder.getIndexType();
+  SmallVector<Type> outsTypes;
+  SmallVector<OpAsmParser::OperandType, 4> outsOperands;
+
+  llvm::SMLoc outputsOperandsLoc;
+  if (parser.parseOperand(tileSizes) ||
+      parser.resolveOperand(tileSizes, tileSizesType, result.operands))
+    return failure();
+
+  // Parse the `tiled_dim` attribute or set it to 0 implicitly when elided.
+  if (succeeded(parser.parseOptionalKeyword(TileOp::getTiledDimAttrName()))) {
+    outputsOperandsLoc = parser.getCurrentLocation();
+    Attribute valueAttr;
+    if (parser.parseEqual() ||
+        parser.parseAttribute(valueAttr, TileOp::getTiledDimAttrName(),
+                              result.attributes))
+      return failure();
+  } else {
+    result.attributes.append(TileOp::getTiledDimAttrName(),
+                             parser.getBuilder().getI64IntegerAttr(0));
+  }
+
+  if (succeeded(parser.parseOptionalKeyword("outs"))) {
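+    // Throwaway outputs required by parseFunctionArgumentList's signature;
+    // only the operand names and types are consumed below.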
+    bool _1;
+    SmallVector<NamedAttrList> _2;
+    SmallVector<Location> _3;
+    outputsOperandsLoc = parser.getCurrentLocation();
+    if (mlir::function_interface_impl::parseFunctionArgumentList(
+            parser,
+            /*allowAttributes=*/false,
+            /*allowVariadic=*/false, outsOperands, outsTypes, /*argAttrs=*/_2,
+            /*argLocations=*/_3,
+            /*isVariadic=*/_1) ||
+        parser.resolveOperands(outsOperands, outsTypes, outputsOperandsLoc,
+                               result.operands))
+      return failure();
+  }
+  if (parser.parseArrowTypeList(result.types)) return failure();
+
+  SmallVector<OpAsmParser::OperandType, 8> regionOperands;
+  std::unique_ptr<Region> region = std::make_unique<Region>();
+  SmallVector<Type, 8> operandTypes, regionTypes;
+  if (parser.parseRegion(*region, regionOperands, regionTypes))
+    return failure();
+
+  // Parse the optional attribute list.
+  if (parser.parseOptionalAttrDict(result.attributes)) return failure();
+
+  TileOp::ensureTerminator(*region, builder, result.location);
+  result.addRegion(std::move(region));
+
+  return success();
+}
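Note for reviewers: reading the printer back, the custom syntax is roughly the following (SSA names and types are made up, and the region body/terminator are elided):

%0 = iree_linalg_ext.tile %sz outs(%out: tensor<?xf32>) -> (tensor<?xf32>) {
^bb0(%offset: index, %size: index, %out_slice: tensor<?xf32>):
  // ... body producing the tile for [%offset, %offset + %size) ...
}

A non-zero `tiled_dim` prints as `tiled_dim = <n>` between the tile size and the `outs` clause; when elided it defaults to 0.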
+
+//===----------------------------------------------------------------------===//
+// InParallelOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult InParallelOp::verify() {
+  // Check that the body defines a single block argument for the thread index.
+  auto *body = getBody();
+  if (body->getNumArguments() != 1)
+    return emitOpError("body expects exactly one argument");
+  if (!body->getArgument(0).getType().isIndex())
+    return emitOpError(
+        "expected body first argument to be an index argument for "
+        "the thread index");
+
+  // Verify consistency between the result types and the terminator.
+  auto terminatorTypes = getTerminator().yieldedTypes();
+  auto opResults = getResults();
+  if (opResults.size() != terminatorTypes.size())
+    return emitOpError("produces ")
+           << opResults.size() << " results, but its terminator yields "
+           << terminatorTypes.size() << " values";
+  unsigned i = 0;
+  for (auto e : llvm::zip(terminatorTypes, opResults)) {
+    if (std::get<0>(e) != std::get<1>(e).getType())
+      return emitOpError() << "type mismatch between " << i
+                           << "th result of in_parallel (" << std::get<0>(e)
+                           << ") and " << i << "th result yielded by its "
+                           << "terminator (" << std::get<1>(e).getType() << ")";
+    i++;
+  }
+
+  return success();
+}
+
+void InParallelOp::print(OpAsmPrinter &p) {
+  p << ' ' << num_threads() << ' ';
+  p << " -> (" << getResultTypes() << ") ";
+  p.printRegion(region(),
+                /*printEntryBlockArgs=*/true,
+                /*printBlockTerminators=*/true);
+  p.printOptionalAttrDict(getOperation()->getAttrs());
+}
+
+ParseResult InParallelOp::parse(OpAsmParser &parser, OperationState &result) {
+  auto &builder = parser.getBuilder();
+
+  OpAsmParser::OperandType numThreads;
+  Type indexType = builder.getIndexType();
+
+  if (parser.parseOperand(numThreads) ||
+      parser.resolveOperand(numThreads, indexType, result.operands))
+    return failure();
+  if (parser.parseArrowTypeList(result.types)) return failure();
+
+  SmallVector<OpAsmParser::OperandType, 8> regionOperands;
+  SmallVector<Type, 8> regionTypes;
+  std::unique_ptr<Region> region = std::make_unique<Region>();
+  if (parser.parseRegion(*region, regionOperands, regionTypes))
+    return failure();
+  InParallelOp::ensureTerminator(*region, builder, result.location);
+  result.addRegion(std::move(region));
+
+  // Parse the optional attribute list.
+  if (parser.parseOptionalAttrDict(result.attributes)) return failure();
+  return success();
+}
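In the same spirit, `in_parallel` round-trips roughly as (illustrative only):

%r = iree_linalg_ext.in_parallel %num_threads -> (tensor<?xf32>) {
^bb0(%thread_id: index):
  // ... per-thread work ...
  iree_linalg_ext.perform_concurrently {
    // ... iree_linalg_ext.parallel_insert_slice ops, one per result ...
  }
}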
+
+// Bodyless builder, result types must be specified.
+void InParallelOp::build(mlir::OpBuilder &builder, mlir::OperationState &result,
+                         TypeRange resultTypes, Value numThreads) {
+  // TODO: Pass better location.
+  Location loc = numThreads.getLoc();
+  result.addOperands(numThreads);
+
+  Region *bodyRegion = result.addRegion();
+  bodyRegion->push_back(new Block);
+  Block &bodyBlock = bodyRegion->front();
+  bodyBlock.addArgument(builder.getIndexType(), loc);
+
+  // Create the default terminator if the builder is not provided and if the
+  // iteration arguments are not provided. Otherwise, leave this to the caller
+  // because we don't know which values to return from the loop.
+  InParallelOp::ensureTerminator(*bodyRegion, builder, result.location);
+  result.addTypes(resultTypes);
+}
+
+// Builder that takes a bodyBuilder lambda, result types are inferred from
+// the terminator.
+void InParallelOp::build(
+    mlir::OpBuilder &builder, mlir::OperationState &result, Value numThreads,
+    function_ref<void(OpBuilder &, Location, Value)> bodyBuilder) {
+  // TODO: Pass better location.
+  Location loc = numThreads.getLoc();
+  result.addOperands(numThreads);
+
+  Region *bodyRegion = result.addRegion();
+  bodyRegion->push_back(new Block);
+  Block &bodyBlock = bodyRegion->front();
+  bodyBlock.addArgument(builder.getIndexType(), loc);
+
+  OpBuilder::InsertionGuard guard(builder);
+  builder.setInsertionPointToStart(&bodyBlock);
+  bodyBuilder(builder, result.location, bodyBlock.getArgument(0));
+  auto terminator =
+      llvm::cast<PerformConcurrentlyOp>(bodyBlock.getTerminator());
+  result.addTypes(terminator.yieldedTypes());
+}
+
+// The ensureTerminator method generated by SingleBlockImplicitTerminator is
+// unaware of the fact that our terminator also needs a region to be well
+// formed. We override it here to ensure that we do the right thing.
+void InParallelOp::ensureTerminator(Region &region, Builder &builder,
+                                    Location loc) {
+  OpTrait::SingleBlockImplicitTerminator<PerformConcurrentlyOp>::Impl<
+      InParallelOp>::ensureTerminator(region, builder, loc);
+  auto terminator =
+      llvm::dyn_cast<PerformConcurrentlyOp>(region.front().getTerminator());
+  PerformConcurrentlyOp::ensureTerminator(terminator.getRegion(), builder, loc);
+}
+
+PerformConcurrentlyOp InParallelOp::getTerminator() {
+  return cast<PerformConcurrentlyOp>(getBody()->getTerminator());
+}
+
+//===----------------------------------------------------------------------===//
+// ParallelInsertSliceOp
+//===----------------------------------------------------------------------===//
+
+// Build a ParallelInsertSliceOp with mixed static and dynamic entries.
+void ParallelInsertSliceOp::build(OpBuilder &b, OperationState &result,
+                                  Value source, Value dest,
+                                  ArrayRef<OpFoldResult> offsets,
+                                  ArrayRef<OpFoldResult> sizes,
+                                  ArrayRef<OpFoldResult> strides,
+                                  ArrayRef<NamedAttribute> attrs) {
+  SmallVector<int64_t> staticOffsets, staticSizes, staticStrides;
+  SmallVector<Value> dynamicOffsets, dynamicSizes, dynamicStrides;
+  dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets,
+                             ShapedType::kDynamicStrideOrOffset);
+  dispatchIndexOpFoldResults(sizes, dynamicSizes, staticSizes,
+                             ShapedType::kDynamicSize);
+  dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides,
+                             ShapedType::kDynamicStrideOrOffset);
+  build(b, result, {}, source, dest, dynamicOffsets, dynamicSizes,
+        dynamicStrides, b.getI64ArrayAttr(staticOffsets),
+        b.getI64ArrayAttr(staticSizes), b.getI64ArrayAttr(staticStrides));
+  result.addAttributes(attrs);
+}
+
+// Build a ParallelInsertSliceOp with dynamic entries.
+void ParallelInsertSliceOp::build(OpBuilder &b, OperationState &result,
+                                  Value source, Value dest, ValueRange offsets,
+                                  ValueRange sizes, ValueRange strides,
+                                  ArrayRef<NamedAttribute> attrs) {
+  SmallVector<OpFoldResult> offsetValues = llvm::to_vector<4>(
+      llvm::map_range(offsets, [](Value v) -> OpFoldResult { return v; }));
+  SmallVector<OpFoldResult> sizeValues = llvm::to_vector<4>(
+      llvm::map_range(sizes, [](Value v) -> OpFoldResult { return v; }));
+  SmallVector<OpFoldResult> strideValues = llvm::to_vector<4>(
+      llvm::map_range(strides, [](Value v) -> OpFoldResult { return v; }));
+  build(b, result, source, dest, offsetValues, sizeValues, strideValues);
+}
+
+namespace {
+/// Pattern to rewrite a parallel_insert_slice op with constant arguments.
+class ParallelInsertSliceOpConstantArgumentFolder final
+    : public OpRewritePattern<ParallelInsertSliceOp> {
+ public:
+  using OpRewritePattern<ParallelInsertSliceOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(ParallelInsertSliceOp insertSliceOp,
+                                PatternRewriter &rewriter) const override {
+    // No constant operand, just return.
+    if (llvm::none_of(insertSliceOp.getOperands(), [](Value operand) {
+          return matchPattern(operand, matchConstantIndex());
+        }))
+      return failure();
+
+    // At least one of offsets/sizes/strides is a new constant.
+    // Form the new list of operands and constant attributes from the
+    // existing.
+    SmallVector<OpFoldResult> mixedOffsets(insertSliceOp.getMixedOffsets());
+    SmallVector<OpFoldResult> mixedSizes(insertSliceOp.getMixedSizes());
+    SmallVector<OpFoldResult> mixedStrides(insertSliceOp.getMixedStrides());
+    canonicalizeSubViewPart(mixedOffsets, ShapedType::isDynamicStrideOrOffset);
+    canonicalizeSubViewPart(mixedSizes, ShapedType::isDynamic);
+    canonicalizeSubViewPart(mixedStrides, ShapedType::isDynamicStrideOrOffset);
+
+    // Create the new op in canonical form.
+    rewriter.replaceOpWithNewOp<ParallelInsertSliceOp>(
+        insertSliceOp, insertSliceOp.source(), insertSliceOp.dest(),
+        mixedOffsets, mixedSizes, mixedStrides);
+    return success();
+  }
+};
+}  // namespace
+
+void ParallelInsertSliceOp::getCanonicalizationPatterns(
+    RewritePatternSet &results, MLIRContext *context) {
+  results.add<ParallelInsertSliceOpConstantArgumentFolder>(context);
+}
+
+//===----------------------------------------------------------------------===//
+// PerformConcurrentlyOp
+//===----------------------------------------------------------------------===//
+
+// TODO(ntv,apaszke): Implement this
+LogicalResult PerformConcurrentlyOp::verify() { return success(); }
+
+void PerformConcurrentlyOp::print(OpAsmPrinter &p) {
+  p << " ";
+  p.printRegion(region(),
+                /*printEntryBlockArgs=*/false,
+                /*printBlockTerminators=*/false);
+  p.printOptionalAttrDict(getOperation()->getAttrs());
+}
+
+ParseResult PerformConcurrentlyOp::parse(OpAsmParser &parser,
+                                         OperationState &result) {
+  auto &builder = parser.getBuilder();
+
+  SmallVector<OpAsmParser::OperandType, 8> regionOperands;
+  SmallVector<Type, 8> regionTypes;
+  std::unique_ptr<Region> region = std::make_unique<Region>();
+  if (parser.parseRegion(*region, regionOperands, regionTypes))
+    return failure();
+  PerformConcurrentlyOp::ensureTerminator(*region, builder, result.location);
+  result.addRegion(std::move(region));
+
+  // Parse the optional attribute list.
+  if (parser.parseOptionalAttrDict(result.attributes)) return failure();
+  return success();
+}
+
+SmallVector<Type> PerformConcurrentlyOp::yieldedTypes() {
+  return llvm::to_vector(llvm::map_range(
+      this->yieldingOps(),
+      [](ParallelInsertSliceOp op) { return op.yieldedType(); }));
+}
+
+SmallVector<ParallelInsertSliceOp> PerformConcurrentlyOp::yieldingOps() {
+  SmallVector<ParallelInsertSliceOp> ret;
+  for (Operation &op : *getBody()) {
+    // TODO: interface when this grows up.
+    if (auto sliceOp = llvm::dyn_cast<ParallelInsertSliceOp>(op)) {
+      ret.push_back(sliceOp);
+      continue;
+    }
+    if (auto endPerformOp = llvm::dyn_cast<EndPerformConcurrentlyOp>(op)) {
+      continue;
+    }
+    llvm_unreachable("Unexpected operation in perform_concurrently");
+  }
+  return ret;
+}
+
+//===----------------------------------------------------------------------===//
 // LinalgExtDialect
 //===----------------------------------------------------------------------===//
 
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Passes/CMakeLists.txt b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Passes/CMakeLists.txt
new file mode 100644
index 0000000..e26003e
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Passes/CMakeLists.txt
@@ -0,0 +1,25 @@
+add_mlir_library(IREELinalgExtPasses
+  ConvertToLoops.cpp
+  PadContractionToBlockSize.cpp
+  Passes.cpp
+  Tiling.cpp
+
+  DEPENDS
+  IREELinalgExtPassesIncGen
+
+  LINK_LIBS PUBLIC
+  IREEInputDialect
+  IREELinalgExtDialect
+  MLIRAffine
+  MLIRIR
+  MLIRLinalg
+  MLIRLinalgTransforms
+  MLIRMath
+  MLIRMemRef
+  MLIRPass
+  MLIRSCF
+  MLIRFunc
+  MLIRSupport
+  MLIRTensor
+  MLIRTransforms
+)
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Passes/ConvertToLoops.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Passes/ConvertToLoops.cpp
new file mode 100644
index 0000000..da62126
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Passes/ConvertToLoops.cpp
@@ -0,0 +1,115 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtDialect.h"
+#include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h"
+#include "iree-dialects/Dialect/LinalgExt/Passes/PassDetail.h"
+#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/SCF.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+namespace IREE = mlir::iree_compiler::IREE;
+using namespace IREE::LinalgExt;
+
+/// Recursive method that lowers one dimension of the `TiledOpInterface` to
+/// scalar loops at a time.
+static LogicalResult lowerToLoopsImpl(OpBuilder &builder,
+                                      TiledOpInterface tilableOp,
+                                      ArrayRef<Range> loopRanges,
+                                      unsigned loopDepth,
+                                      SmallVectorImpl<Value> &ivs) {
+  Location loc = tilableOp.getLoc();
+  if (loopDepth == loopRanges.size()) {
+    return tilableOp.generateScalarImplementation(builder, loc, ivs);
+  }
+  LogicalResult status = success();
+  builder.create<scf::ForOp>(
+      loc, loopRanges[loopDepth].offset, loopRanges[loopDepth].size,
+      loopRanges[loopDepth].stride, ValueRange{},
+      [&](OpBuilder &b, Location loc, Value iv, ValueRange args) {
+        ivs.push_back(iv);
+        status = lowerToLoopsImpl(b, tilableOp, loopRanges, loopDepth + 1, ivs);
+        b.create<scf::YieldOp>(loc);
+      });
+  return status;
+}
+
+/// Main entry point for lowering `TiledOpInterface` op to loops.
+static LogicalResult lowerToLoops(OpBuilder &builder,
+                                  TiledOpInterface tilableOp) {
+  SmallVector<Range> loopBounds = tilableOp.getIterationDomain(builder);
+  SmallVector<Value> ivs;
+  return lowerToLoopsImpl(builder, tilableOp, loopBounds, 0, ivs);
+}
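For intuition, each recursion level peels off one loop, so a rank-2 iteration domain lowers to a perfect scf.for nest with the scalar implementation emitted at the innermost level; schematically (placeholder SSA names, not actual output):

scf.for %i = %lb0 to %ub0 step %st0 {
  scf.for %j = %lb1 to %ub1 step %st1 {
    // generateScalarImplementation(builder, loc, {%i, %j}) emits the body here.
  }
}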
+
+/// Pattern rewriter hook to lower a `TiledOpInterface` to loops.
+namespace {
+struct TiledOpInterfaceLowerToLoopsPattern : public RewritePattern {
+  TiledOpInterfaceLowerToLoopsPattern(MLIRContext *context,
+                                      PatternBenefit benefit = 1)
+      : RewritePattern(MatchAnyOpTypeTag(), benefit, context) {}
+
+  LogicalResult matchAndRewrite(Operation *op,
+                                PatternRewriter &rewriter) const override {
+    auto tilableOp = dyn_cast<TiledOpInterface>(op);
+    if (!tilableOp) {
+      return failure();
+    }
+    if (llvm::any_of(tilableOp->getResults(),
+                     [&](Value v) { return v.getType().isa<ShapedType>(); })) {
+      return rewriter.notifyMatchFailure(
+          tilableOp, "lower to loops needs to have tensor semantics");
+    }
+    if (failed(lowerToLoops(rewriter, tilableOp))) {
+      return failure();
+    }
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+}  // namespace
+
+//===----------------------------------------------------------------------===//
+// Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+struct LinalgExtToLoopsPass
+    : public LinalgExtToLoopsBase<LinalgExtToLoopsPass> {
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<linalg::LinalgDialect, func::FuncDialect,
+                    mlir::arith::ArithmeticDialect, math::MathDialect,
+                    memref::MemRefDialect, scf::SCFDialect>();
+  }
+
+  void runOnOperation() override {
+    MLIRContext *context = &getContext();
+
+    RewritePatternSet patterns(context);
+    patterns.insert<TiledOpInterfaceLowerToLoopsPattern>(context);
+    if (failed(applyPatternsAndFoldGreedily(getOperation(),
+                                            std::move(patterns)))) {
+      return signalPassFailure();
+    }
+  }
+};
+}  // namespace
+
+std::unique_ptr<OperationPass<FuncOp>>
+IREE::LinalgExt::createLinalgExtToLoopsPass() {
+  return std::make_unique<LinalgExtToLoopsPass>();
+}
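Downstream pipelines can then schedule the pass via the declared factory; a minimal sketch, assuming the Passes.h header is on the include path (at this revision FuncOp is still spelled mlir::FuncOp, matching the OperationPass<FuncOp> factory above):

#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h"
#include "mlir/Pass/PassManager.h"

// Sketch: nest the LinalgExt-to-loops lowering on every function.
void addLinalgExtToLoops(mlir::PassManager &pm) {
  pm.addNestedPass<mlir::FuncOp>(
      mlir::iree_compiler::IREE::LinalgExt::createLinalgExtToLoopsPass());
}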
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Passes/PadContractionToBlockSize.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Passes/PadContractionToBlockSize.cpp
new file mode 100644
index 0000000..a2fe9bd
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Passes/PadContractionToBlockSize.cpp
@@ -0,0 +1,140 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree-dialects/Dialect/Input/InputDialect.h"
+#include "iree-dialects/Dialect/Input/InputOps.h"
+#include "iree-dialects/Dialect/LinalgExt/Passes/PassDetail.h"
+#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h"
+#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Tensor/Utils/Utils.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+namespace IREE = mlir::iree_compiler::IREE;
+using namespace IREE::LinalgExt;
+
+static Operation *sliceTensor(Location loc, Value expanded, Value original,
+                              OpBuilder &builder) {
+  auto originalType = original.getType().cast<RankedTensorType>();
+  auto rank = originalType.getRank();
+  SmallVector<OpFoldResult> offsets(rank, builder.getI64IntegerAttr(0));
+  SmallVector<OpFoldResult> strides(rank, builder.getI64IntegerAttr(1));
+  SmallVector<OpFoldResult> sizes(rank);
+  for (int i = 0, e = rank; i < e; ++i) {
+    if (!originalType.isDynamicDim(i)) {
+      sizes[i] = builder.getI64IntegerAttr(originalType.getDimSize(i));
+    } else {
+      sizes[i] = builder.create<tensor::DimOp>(loc, original, i).getResult();
+    }
+  }
+
+  return builder.create<tensor::ExtractSliceOp>(loc, expanded, offsets, sizes,
+                                                strides);
+}
+
+static bool padTensor(Location loc, OpOperand *operand,
+                      ArrayRef<int64_t> alignments, OpBuilder &builder) {
+  Value original = operand->get();
+  auto type = original.getType().cast<RankedTensorType>();
+  ArrayRef<int64_t> shape = type.getShape();
+  assert(shape.size() == alignments.size() &&
+         "expected shape and alignments to match");
+
+  // New dimensions.
+  SmallVector<int64_t> newStaticDims;
+  newStaticDims.resize(shape.size(), -1);
+  SmallVector<OpFoldResult> newPaddingSizes(shape.size(),
+                                            builder.getI64IntegerAttr(0));
+
+  // Compute padded dims.
+  bool needsPad = false;
+  for (int i = 0, e = shape.size(); i < e; ++i) {
+    auto inputDim = shape[i];
+    auto alignment = alignments[i];
+    if (inputDim >= 0) {
+      // Static dim.
+      if ((inputDim % alignment) == 0) {
+        newStaticDims[i] = inputDim;
+        continue;
+      }
+      int64_t alignedDim = (inputDim + (alignment - 1)) & ~(alignment - 1);
+      newStaticDims[i] = alignedDim;
+      newPaddingSizes[i] = builder.getI64IntegerAttr(alignedDim - inputDim);
+      needsPad = true;
+    } else {
+      // Dynamic dim.
+      Value inputDimValue = builder.create<tensor::DimOp>(loc, original, i);
+      Value alignedDim =
+          builder.create<IREE::Input::AlignOp>(loc, inputDimValue, alignment);
+      newPaddingSizes[i] = alignedDim;
+      needsPad = true;
+    }
+  }
+  if (!needsPad) return false;
+
+  auto resultType = RankedTensorType::get(newStaticDims, type.getElementType());
+  Value zeroConstant = builder.create<arith::ConstantOp>(
+      loc, builder.getZeroAttr(type.getElementType()));
+  SmallVector<OpFoldResult> zeroStaticLow(shape.size(),
+                                          builder.getI64IntegerAttr(0));
+  SmallVector<Value> nullLow;
+  Value padded = tensor::createPadScalarOp(
+      resultType, operand->get(), zeroConstant, zeroStaticLow, newPaddingSizes,
+      false, loc, builder);
+  operand->set(padded);
+  return true;
+}
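Sanity check on the alignment arithmetic above, which relies on `alignment` being a power of two: for inputDim = 5 and alignment = 4,

  alignedDim         = (5 + (4 - 1)) & ~(4 - 1) = 8 & ~3 = 8
  newPaddingSizes[i] = alignedDim - inputDim    = 3

i.e. a 5-wide static dim is padded up to 8 with 3 trailing elements of the zero constant.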
+
+namespace {
+
+struct PadContractionToBlockSizePass
+    : public PadContractionToBlockSizeBase<PadContractionToBlockSizePass> {
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<IREE::Input::IREEInputDialect>();
+  }
+
+  void runOnOperation() override {
+    getOperation()->walk([&](linalg::ContractionOpInterface op) {
+      auto linalgOp = llvm::cast<linalg::LinalgOp>(op.getOperation());
+      Location loc = op.getLoc();
+      OpOperand *lhs = linalgOp.getInputOperand(0);
+      OpOperand *rhs = linalgOp.getInputOperand(1);
+      OpOperand *output = linalgOp.getOutputOperand(0);
+      Value origOutput = output->get();
+      OpResult result = op.getOperation()->getResult(0);
+
+      bool insertSlice = false;
+      OpBuilder builder(op.getOperation());
+      if (op.isRowMajorMatmul()) {
+        padTensor(loc, lhs, {rowAlignment, rowAlignment}, builder);
+        padTensor(loc, rhs, {rowAlignment, columnAlignment}, builder);
+        if (padTensor(loc, output, {rowAlignment, columnAlignment}, builder)) {
+          result.setType(output->get().getType());
+          insertSlice = true;
+        }
+      }
+
+      // Insert an appropriate extract.
+      if (insertSlice) {
+        builder.setInsertionPointAfter(op.getOperation());
+        Operation *slicedResult = sliceTensor(loc, result, origOutput, builder);
+        result.replaceAllUsesExcept(slicedResult->getResult(0), slicedResult);
+      }
+
+      return WalkResult::advance();
+    });
+  }
+};
+}  // namespace
+
+std::unique_ptr<OperationPass<>>
+IREE::LinalgExt::createPadContractionToBlockSizePass() {
+  return std::make_unique<PadContractionToBlockSizePass>();
+}
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Passes/Passes.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Passes/Passes.cpp
new file mode 100644
index 0000000..f038541
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Passes/Passes.cpp
@@ -0,0 +1,33 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h"
+
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassRegistry.h"
+#include "mlir/Transforms/Passes.h"
+
+using namespace mlir;
+namespace IREE = mlir::iree_compiler::IREE;
+
+namespace mlir {
+namespace iree_compiler {
+namespace IREE {
+namespace LinalgExt {
+
+namespace detail {
+#define GEN_PASS_REGISTRATION
+#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h.inc"  // IWYU pragma: export
+}  // namespace detail
+
+}  // namespace LinalgExt
+}  // namespace IREE
+}  // namespace iree_compiler
+}  // namespace mlir
+
+void IREE::LinalgExt::registerPasses() {
+  IREE::LinalgExt::detail::registerPasses();
+}
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Passes/Tiling.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Passes/Tiling.cpp
new file mode 100644
index 0000000..fd66bff
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Passes/Tiling.cpp
@@ -0,0 +1,360 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree-dialects/Dialect/Input/InputDialect.h"
+#include "iree-dialects/Dialect/Input/InputOps.h"
+#include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtDialect.h"
+#include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h"
+#include "iree-dialects/Dialect/LinalgExt/Passes/PassDetail.h"
+#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h"
+#include "iree-dialects/Dialect/LinalgExt/Passes/Transforms.h"
+#include "llvm/ADT/TypeSwitch.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/SCF.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/IR/Matchers.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+namespace IREE = mlir::iree_compiler::IREE;
+using namespace IREE::LinalgExt;
+
+//===----------------------------------------------------------------------===//
+// Utility methods for tiling a linalg_ext operation that implements a
+// TiledOpInterface
+//===----------------------------------------------------------------------===//
+
+/// Returns failure if the options are unsupported.
+static LogicalResult verifySupportedTilingOptions(
+    PatternRewriter &rewriter, Operation *op,
+    const linalg::LinalgTilingOptions &options) {
+  if (!options.interchangeVector.empty()) {
+    return rewriter.notifyMatchFailure(op,
+                                       "unsupported interchange during tiling");
+  }
+  if (options.loopType != linalg::LinalgTilingLoopType::Loops) {
+    return rewriter.notifyMatchFailure(op,
+                                       "only tiling with scf.for is supported");
+  }
+  if (options.distribution) {
+    if (llvm::any_of(options.distribution->distributionMethod,
+                     [](linalg::DistributionMethod method) {
+                       return method != linalg::DistributionMethod::Cyclic;
+                     })) {
+      return rewriter.notifyMatchFailure(op,
+                                         "only cyclic distibution is allowed");
+    }
+  }
+  return success();
+}
+
+/// Converts an `OpFoldResult` to a `Value` by building a constant op if
+/// the `OpFoldResult` is an `IntegerAttr`.
+static Value getValue(OpBuilder &builder, Location loc,
+                      OpFoldResult valueOrAttr) {
+  if (auto attr = valueOrAttr.dyn_cast<Attribute>()) {
+    return builder.create<arith::ConstantIndexOp>(
+        loc, attr.cast<IntegerAttr>().getInt());
+  }
+  return valueOrAttr.get<Value>();
+}
+
+/// Returns true if loop is untiled. Only checks if the value is statically
+/// zero. It is assumed that a `Value` defined by a constant op is already
+/// converted to an `IntegerAttr` of that value. So here just return true if
+/// this is an attribute with a zero value.
+static bool isUntiledLoop(OpFoldResult valueOrAttr) {
+  Optional<int64_t> intVal = getConstantIntValue(valueOrAttr);
+  return intVal && *intVal == 0;
+}
+
+/// Generates the tiled loops and the body by invoking the interface methods of
+/// TiledOpInterface.
+/// - `outputs` are the operands to use for outputs of the tiled operation.
+/// - `tileSizes` are tile sizes specified for all loops of the operation. If a
+///   loop is to be untiled it is set to 0.
+/// - `iteratorTypes` are the iterator types of the loops returned by the
+///   TiledOpInterface.
+/// - `loopBounds` are the bounds of all the loops of the op returned by the
+///   TiledOpInterface.
+/// - `loopDepth` is the current loop depth being processed.
+/// - `offsets` are the `Value`s that represent the position of the tile being
+///   operated on. The offsets are computed as the tiled loops are being
+///   generated.
+/// - `distributionInfo` is the proc_id and nprocs `Value`s to be used for
+///   distributed loops. It is a stack, and once an entry at the top of the
+///   stack is used for distribution it is popped before processing the inner
+///   loops.
+static FailureOr<TiledOp> tileInterfaceOpImpl(
+    OpBuilder &builder, TiledOpInterface tilableOp, ValueRange outputs,
+    MutableArrayRef<OpFoldResult> tileSizes, ArrayRef<StringRef> iteratorTypes,
+    ArrayRef<Range> loopBounds, unsigned loopDepth,
+    SmallVectorImpl<OpFoldResult> &offsets,
+    ArrayRef<linalg::ProcInfo> distributionInfo) {
+  Location loc = tilableOp.getLoc();
+  // If this is the innermost loop, then generate the tiled implementation of
+  // the op by invoking the TiledOpInterface methods.
+  if (loopDepth == tileSizes.size()) {
+    TiledOp ret;
+    ret.op = tilableOp.getTiledImplementation(builder, outputs, offsets,
+                                              tileSizes, ret.results);
+    if (!ret.op) {
+      return static_cast<LogicalResult>(
+          tilableOp.emitOpError("failed to get tiled implementation"));
+    }
+    return ret;
+  }
+
+  // If tile size at this depth is empty, do nothing.
+  if (isUntiledLoop(tileSizes[loopDepth])) {
+    auto zeroAttr = builder.getI64IntegerAttr(0);
+    offsets.push_back(zeroAttr);
+    assert(matchPattern(loopBounds[loopDepth].offset, m_Zero()) &&
+           "expected loop bounds to have lower bound of zero");
+    tileSizes[loopDepth] = getAsOpFoldResult(loopBounds[loopDepth].size);
+    return tileInterfaceOpImpl(builder, tilableOp, outputs, tileSizes,
+                               iteratorTypes, loopBounds, loopDepth + 1,
+                               offsets, distributionInfo);
+  }
+
+  // Generate an scf.for for the current loop depth.
+  Value lb = loopBounds[loopDepth].offset;
+  Value ub = loopBounds[loopDepth].size;
+  // TODO(#7073): Put the check back. This is required by tiling linalg_ext.fft
+  // op. We can put the check back after updating linalg_ext.fft semantics.
+  // if (!matchPattern(loopBounds[loopDepth].stride, m_One())) {
+  // return static_cast<LogicalResult>(
+  // tilableOp.emitOpError("expected stride to be 1"));
+  //}
+  Value step = getValue(builder, loc, tileSizes[loopDepth]);
+
+  // Update lb, ub and step for cyclic distribution.
+  if (!distributionInfo.empty() &&
+      iteratorTypes[loopDepth] == getParallelIteratorTypeName()) {
+    linalg::updateBoundsForCyclicDistribution(
+        builder, loc, distributionInfo.front().procId,
+        distributionInfo.front().nprocs, lb, ub, step);
+    distributionInfo = distributionInfo.drop_front();
+  }
+  FailureOr<TiledOp> innerReturnValue;
+  bool isBufferTiling = tilableOp->getNumResults() == 0;
+  ValueRange initValues(isBufferTiling ? ValueRange{} : outputs);
+  auto forOp = builder.create<scf::ForOp>(
+      loc, lb, ub, step, initValues,
+      [&](OpBuilder &b, Location loc, Value iv, ValueRange args) {
+        offsets.push_back(iv);
+        auto affineMaps = AffineMap::inferFromExprList({ArrayRef<AffineExpr>{
+            b.getAffineSymbolExpr(0),
+            b.getAffineSymbolExpr(1) - b.getAffineDimExpr(0)}})[0];
+        // Similar to linalg tiling, the tile size is min(tileSize, ub - iv)
+        // to account for cases where the tile size does not divide (ub - lb)
+        // exactly.
+        Value inBoundsTileSize = b.create<AffineMinOp>(
+            loc, affineMaps,
+            ValueRange{iv, getValue(builder, loc, tileSizes[loopDepth]), ub});
+        tileSizes[loopDepth] = getAsOpFoldResult(inBoundsTileSize);
+        // Recursively proceed to generate the tiled loop for the next level.
+        innerReturnValue =
+            tileInterfaceOpImpl(b, tilableOp, (isBufferTiling ? outputs : args),
+                                tileSizes, iteratorTypes, loopBounds,
+                                loopDepth + 1, offsets, distributionInfo);
+        if (failed(innerReturnValue)) return;
+        b.create<scf::YieldOp>(loc, innerReturnValue->results);
+      });
+  if (failed(innerReturnValue)) {
+    return innerReturnValue;
+  }
+  innerReturnValue->loops.insert(innerReturnValue->loops.begin(),
+                                 forOp.getOperation());
+  innerReturnValue->results = forOp.getResults();
+  return innerReturnValue;
+}
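
The in-bounds tile size above is the usual linalg-style clamp, min(tileSize, ub - iv), which keeps the last (possibly partial) tile within bounds. A minimal standalone sketch of the same map, assuming a builder `b`, context `ctx`, location `loc`, and index values `iv`, `tileSize`, and `ub` already in scope:

    // min(tileSize, ub - iv): clamp the last, possibly partial, tile.
    AffineExpr d0, s0, s1;
    bindDims(ctx, d0);
    bindSymbols(ctx, s0, s1);
    AffineMap minMap =
        AffineMap::get(/*dimCount=*/1, /*symbolCount=*/2, {s0, s1 - d0}, ctx);
    Value inBoundsTileSize =
        b.create<AffineMinOp>(loc, minMap, ValueRange{iv, tileSize, ub});
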
+
+FailureOr<TiledOp> tileInterfaceOp(OpBuilder &b, TiledOpInterface tilableOp,
+                                   const linalg::LinalgTilingOptions &options) {
+  SmallVector<Value> dest = tilableOp.getDestinationOperands(b);
+  if (dest.empty()) {
+    return static_cast<LogicalResult>(tilableOp.emitOpError(
+        "cannot tile operation without destination operands"));
+  }
+
+  SmallVector<StringRef> iteratorTypes = tilableOp.getLoopIteratorTypes();
+  SmallVector<Value, 4> tileSizesVals =
+      options.tileSizeComputationFunction(b, tilableOp);
+  auto zeroAttr = b.getI64IntegerAttr(0);
+
+  // The actual tile sizes used convert any `Value` defined as constant 0 into
+  // a zero integer attribute. Currently, if the iterator type is not
+  // "parallel", the tile size is forced to zero as well.
+  auto tileSizes = getAsOpFoldResult(tileSizesVals);
+  tileSizes.resize(iteratorTypes.size(), zeroAttr);
+  for (auto en : llvm::enumerate(iteratorTypes)) {
+    if (en.value() == getParallelIteratorTypeName()) continue;
+    if (!isUntiledLoop(tileSizes[en.index()])) {
+      return static_cast<LogicalResult>(tilableOp.emitOpError(
+          "unimplemented tiling of non-parallel loop iterator type"));
+    }
+  }
+
+  // Trivial early exit case of tile sizes being zero for all parallel loops.
+  if (llvm::all_of(tileSizes, isUntiledLoop)) {
+    return TiledOp{tilableOp, {}, {}};
+  }
+
+  SmallVector<Range> loopBounds = tilableOp.getIterationDomain(b);
+  SmallVector<linalg::ProcInfo> distributionInfo;
+  // If the tiled loops are distributed, get the proc_id and nprocs for the
+  // distributed loops. First collect the loops that are distributed, i.e.,
+  // loops that are both
+  // - parallel, i.e. the iterator type is "parallel", and
+  // - tiled, i.e. the tile size != 0.
+  if (options.distribution) {
+    SmallVector<Range> distributedLoopRange;
+    for (auto i : llvm::seq<unsigned>(0, tileSizes.size())) {
+      if (isUntiledLoop(tileSizes[i])) continue;
+      if (iteratorTypes[i] != getParallelIteratorTypeName()) continue;
+      distributedLoopRange.push_back(loopBounds[i]);
+    }
+    distributionInfo = options.distribution->procInfo(b, tilableOp.getLoc(),
+                                                      distributedLoopRange);
+  }
+
+  SmallVector<OpFoldResult> offsets;
+  return tileInterfaceOpImpl(b, tilableOp, dest, tileSizes, iteratorTypes,
+                             loopBounds, 0, offsets, distributionInfo);
+}
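
As a usage sketch, driving this entry point directly might look as follows (names are hypothetical; the options mirror the test pass further down):

    // Tile the first two loops by 10 and 20; a 0 leaves a loop untiled.
    linalg::LinalgTilingOptions options;
    options.setTileSizes({10, 20});
    FailureOr<TiledOp> tiled = tileInterfaceOp(b, tilableOp, options);
    if (succeeded(tiled)) {
      // tiled->loops holds the generated scf.for nest (outermost first) and
      // tiled->results the values replacing the original op's results.
    }
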
+
+LogicalResult TiledOpInterfaceBaseTilingPattern::matchAndRewriteBase(
+    TiledOpInterface tilableOp, PatternRewriter &rewriter,
+    TiledOp &result) const {
+  if (failed(filter.checkAndNotify(rewriter, tilableOp))) {
+    return failure();
+  }
+  if (failed(verifySupportedTilingOptions(rewriter, tilableOp, options))) {
+    return failure();
+  }
+
+  FailureOr<TiledOp> res = tileInterfaceOp(rewriter, tilableOp, options);
+  if (failed(res)) return res;
+  result = *res;
+  if (result.op) {
+    filter.replaceLinalgTransformationFilter(rewriter, result.op);
+  }
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// Test pass for tiling Linalg Ext ops
+//===----------------------------------------------------------------------===//
+
+namespace {
+struct TiledOpInterfaceTilingPass
+    : public TiledOpInterfaceTilingBase<TiledOpInterfaceTilingPass> {
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<
+        AffineDialect, IREE::Input::IREEInputDialect, linalg::LinalgDialect,
+        IREE::LinalgExt::IREELinalgExtDialect, memref::MemRefDialect,
+        func::FuncDialect, mlir::arith::ArithmeticDialect, math::MathDialect,
+        tensor::TensorDialect, scf::SCFDialect>();
+  }
+  void runOnOperation() override;
+};
+}  // namespace
+
+template <typename OpTy>
+static Value buildFlowWorkgroupInfoOp(OpBuilder &b, unsigned dim) {
+  return b.template create<OpTy>(b.getInsertionPoint()->getLoc(), dim);
+}
+
+void TiledOpInterfaceTilingPass::runOnOperation() {
+  FuncOp funcOp = getOperation();
+  MLIRContext *context = funcOp.getContext();
+
+  RewritePatternSet patterns(context);
+  patterns.add<TiledOpInterfaceTilingPattern>(
+      context, linalg::LinalgTilingOptions().setTileSizes({10, 20}),
+      linalg::LinalgTransformationFilter(
+          StringAttr::get(context, "tiling_input"),
+          StringAttr::get(context, "tiling_output")));
+  patterns.add<TiledOpInterfaceTilingPattern>(
+      context, linalg::LinalgTilingOptions().setTileSizes(ArrayRef<int64_t>{0}),
+      linalg::LinalgTransformationFilter(
+          StringAttr::get(context, "no_tiling_input"),
+          StringAttr::get(context, "no_tiling_output")));
+
+  patterns.add<TiledOpInterfaceTilingPattern>(
+      context, linalg::LinalgTilingOptions().setTileSizes({0, 20}),
+      linalg::LinalgTransformationFilter(
+          StringAttr::get(context, "outer_reduce_input"),
+          StringAttr::get(context, "outer_reduce_output")));
+  patterns.add<TiledOpInterfaceTilingPattern>(
+      context, linalg::LinalgTilingOptions().setTileSizes({10, 0, 0}),
+      linalg::LinalgTransformationFilter(
+          StringAttr::get(context, "inner_reduce_input"),
+          StringAttr::get(context, "inner_reduce_output")));
+
+  static linalg::LinalgLoopDistributionOptions workgroupDistributionOptions = {
+      [](OpBuilder &builder, Location loc, ArrayRef<Range> parallelLoopRanges) {
+        auto numParallelDims = parallelLoopRanges.size();
+
+        SmallVector<linalg::ProcInfo, 3> procInfo(numParallelDims);
+        for (size_t dim = 0; dim < numParallelDims; ++dim) {
+          procInfo[numParallelDims - dim - 1] = {
+              buildFlowWorkgroupInfoOp<IREE::Input::DispatchWorkgroupIDOp>(
+                  builder, dim),
+              buildFlowWorkgroupInfoOp<IREE::Input::DispatchWorkgroupCountOp>(
+                  builder, dim)};
+        }
+        return procInfo;
+      },
+      {linalg::DistributionMethod::Cyclic, linalg::DistributionMethod::Cyclic,
+       linalg::DistributionMethod::Cyclic},
+      DenseMap<StringRef,
+               std::function<linalg::ProcInfo(OpBuilder &, Location)>>()};
+
+  patterns.add<TiledOpInterfaceTilingPattern>(
+      context,
+      linalg::LinalgTilingOptions()
+          .setTileSizes(ArrayRef<int64_t>{10, 0, 30})
+          .setDistributionOptions(workgroupDistributionOptions),
+      linalg::LinalgTransformationFilter(
+          StringAttr::get(context, "distribute_input"),
+          StringAttr::get(context, "distribute_output")));
+
+  patterns.add<TiledOpInterfaceTilingPattern>(
+      context,
+      linalg::LinalgTilingOptions().setTileSizes(ArrayRef<int64_t>{32}),
+      linalg::LinalgTransformationFilter(
+          StringAttr::get(context, "tiling_1d_stage5_fft_input"),
+          StringAttr::get(context, "tiling_1d_stage5_fft_output")));
+
+  patterns.add<TiledOpInterfaceTilingPattern>(
+      context,
+      linalg::LinalgTilingOptions().setTileSizes(ArrayRef<int64_t>{10, 32}),
+      linalg::LinalgTransformationFilter(
+          StringAttr::get(context, "tiling_2d_stage5_fft_input"),
+          StringAttr::get(context, "tiling_2d_stage5_fft_output")));
+
+  patterns.add<TiledOpInterfaceTilingPattern>(
+      context, linalg::LinalgTilingOptions().setTileSizes({0, 20}),
+      linalg::LinalgTransformationFilter(
+          StringAttr::get(context, "tiling_repeated_indices_scatter_input"),
+          StringAttr::get(context, "tiling_repeated_indices_scatter_output")));
+
+  if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) {
+    return signalPassFailure();
+  }
+}
+
+std::unique_ptr<OperationPass<FuncOp>>
+IREE::LinalgExt::createTiledOpInterfaceTilingPass() {
+  return std::make_unique<TiledOpInterfaceTilingPass>();
+}
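
A hedged sketch of invoking this test pass programmatically, assuming a registered MLIRContext `context` and a parsed ModuleOp `module` whose ops carry the filter markers matched above (e.g. "tiling_input"):

    PassManager pm(&context);
    pm.addNestedPass<FuncOp>(
        IREE::LinalgExt::createTiledOpInterfaceTilingPass());
    if (failed(pm.run(module)))
      llvm::errs() << "tiling test pass failed\n";
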
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/CMakeLists.txt b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/CMakeLists.txt
index 0cd7fd0..a174ba1 100644
--- a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/CMakeLists.txt
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/CMakeLists.txt
@@ -1,25 +1,44 @@
-add_mlir_library(IREELinalgExtPasses
-  ConvertToLoops.cpp
-  PadContractionToBlockSize.cpp
-  Passes.cpp
+add_mlir_library(IREELinalgExtTransforms
+  InParallelToAsync.cpp
+  InParallelToSequentialFor.cpp
+  TilingExternalModels.cpp
+  TileToSequentialFor.cpp
+  TileToInParallel.cpp
   Tiling.cpp
+  TilingToTileOp.cpp
+  Utils.cpp
 
+  PARTIAL_SOURCES_INTENDED
   DEPENDS
-  IREELinalgExtTransformsPassesIncGen
+  mlir-headers
+  IREELinalgExtDialect
 
   LINK_LIBS PUBLIC
-  IREEInputDialect
   IREELinalgExtDialect
-  MLIRAffine
+
+  MLIRAffineToStandard
+  MLIRAsync
+  MLIRSCFToControlFlow
+  MLIRLinalgToLLVM
+  MLIRVectorToLLVM
+  MLIRMathToLLVM
+  MLIRMemRefToLLVM
   MLIRIR
+  MLIRMath
   MLIRLinalg
   MLIRLinalgTransforms
-  MLIRMath
-  MLIRMemRef
   MLIRPass
   MLIRSCF
-  MLIRFunc
-  MLIRSupport
-  MLIRTensor
   MLIRTransforms
 )
+
+add_mlir_library(IREELinalgExtOpInterfaceImpl
+  LinalgExtBufferization.cpp
+
+  PARTIAL_SOURCES_INTENDED
+  LINK_LIBS PUBLIC
+  IREELinalgExtDialect
+
+  MLIRBufferization
+  MLIRTensorTransforms
+)
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/InParallelToAsync.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/InParallelToAsync.cpp
new file mode 100644
index 0000000..64514bb
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/InParallelToAsync.cpp
@@ -0,0 +1,91 @@
+//===- InParallelToAsync.cpp - Rewrite InParallel as Async ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstdlib>
+
+#include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h"
+#include "iree-dialects/Dialect/LinalgExt/Transforms/Transforms.h"
+#include "iree-dialects/Dialect/LinalgExt/Transforms/Utils.h"
+#include "llvm/ADT/STLExtras.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
+#include "mlir/Dialect/Async/IR/Async.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/SCF/SCF.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/BlockAndValueMapping.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+using namespace mlir::iree_compiler::IREE::LinalgExt;
+
+FailureOr<Operation *> mlir::iree_compiler::IREE::LinalgExt::
+    InParallelOpToAsyncRewriter::returningMatchAndRewrite(
+        iree_compiler::IREE::LinalgExt::InParallelOp inParallelOp,
+        PatternRewriter &rewriter) const {
+  assert(inParallelOp.getNumResults() == 0 &&
+         "expected bufferized InParallelOp");
+
+  // Only consider the top-level InParallelOp and skip it if it already
+  // contains an ExecuteOp.
+  if (inParallelOp
+          ->getParentOfType<iree_compiler::IREE::LinalgExt::InParallelOp>() ||
+      llvm::any_of(inParallelOp.getBody()->getOperations(),
+                   [](Operation &op) { return isa<async::ExecuteOp>(&op); }))
+    return failure();
+
+  auto *ctx = inParallelOp.getContext();
+  Location loc = inParallelOp.getLoc();
+  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+  Value one = rewriter.create<arith::ConstantIndexOp>(loc, 1);
+  Value numThreads = inParallelOp.num_threads();
+
+  // Wrap the linalg_ext.in_parallel into an async::ExecuteOp.
+  // 1. Create the async::GroupType object on which we synchronize.
+  Value asyncGroup = rewriter.create<async::CreateGroupOp>(
+      loc, async::GroupType::get(ctx), numThreads);
+
+  // 2. Create a bodyless forOp.
+  scf::ForOp forOp = rewriter.create<scf::ForOp>(loc, zero, numThreads, one);
+  rewriter.setInsertionPointToStart(forOp.getBody());
+
+  // 3. Create an empty executeOp, nested within the forOp.
+  auto noopExec = [&](OpBuilder &executeBuilder, Location executeLoc,
+                      ValueRange executeArgs) {};
+  auto executeOp =
+      rewriter.create<async::ExecuteOp>(loc, /*resultTypes=*/TypeRange(),
+                                        /*dependencies=*/ValueRange(),
+                                        /*operands=*/ValueRange(), noopExec);
+
+  // 4. Steal the ops of the iree_compiler::IREE::LinalgExt::InParallelOp,
+  // except the terminator, into the body of the async::ExecuteOp, just before
+  // the terminator.
+  SmallVector<Value> bbArgsTranslated{forOp.getInductionVar()};
+  rewriter.mergeBlocks(&inParallelOp.region().front(), executeOp.getBody(),
+                       bbArgsTranslated);
+  // 4.b. Erase the terminator stolen from inParallelOp.
+  rewriter.eraseOp(&executeOp.getBody()->back());
+  // 4.c. Erase inParallelOp.
+  rewriter.eraseOp(inParallelOp);
+  // 4.d. Add ExecuteOp terminator.
+  rewriter.setInsertionPointToEnd(executeOp.getBody());
+  rewriter.create<async::YieldOp>(loc, ValueRange{});
+  // 4.e. Add to group within the loop.
+  rewriter.setInsertionPoint(forOp.getBody()->getTerminator());
+  rewriter.create<async::AddToGroupOp>(loc, rewriter.getIndexType(),
+                                       executeOp.token(), asyncGroup);
+
+  // 5. After the iree_compiler::IREE::LinalgExt::InParallelOp, await all async
+  // tasks in `asyncGroup`.
+  rewriter.setInsertionPointAfter(forOp);
+  return rewriter.create<async::AwaitAllOp>(loc, asyncGroup).getOperation();
+}
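
Assuming `InParallelOpToAsyncRewriter` is exposed as an ordinary `OpRewritePattern<InParallelOp>` in Transforms.h (the declaration is not shown in this diff), applying the rewrite could look like:

    RewritePatternSet patterns(funcOp.getContext());
    patterns.add<InParallelOpToAsyncRewriter>(funcOp.getContext());
    if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns))))
      signalPassFailure();  // when run from inside a pass
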
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/InParallelToSequentialFor.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/InParallelToSequentialFor.cpp
new file mode 100644
index 0000000..683629b
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/InParallelToSequentialFor.cpp
@@ -0,0 +1,111 @@
+//===- InParallelToSequentialFor.cpp - Rewrite InParallel as ForOp -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+
+#include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h"
+#include "iree-dialects/Dialect/LinalgExt/Transforms/Transforms.h"
+#include "iree-dialects/Dialect/LinalgExt/Transforms/Utils.h"
+#include "llvm/ADT/STLExtras.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/SCF/SCF.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/BlockAndValueMapping.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+using namespace mlir::iree_compiler::IREE::LinalgExt;
+
+namespace {
+
+SmallVector<Value> getValuesToYield(PerformConcurrentlyOp op) {
+  return llvm::to_vector(llvm::map_range(
+      op.yieldingOps(), [](ParallelInsertSliceOp op) { return op.dest(); }));
+}
+
+}  // namespace
+
+FailureOr<scf::ForOp> InParallelOpToScfForRewriter::returningMatchAndRewrite(
+    InParallelOp inParallelOp, PatternRewriter &rewriter) const {
+  // Construct the loop bounds based on the canonical arithmetic progression.
+  Location loc = inParallelOp.getLoc();
+  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+  Value one = rewriter.create<arith::ConstantIndexOp>(loc, 1);
+  Value numThreads = inParallelOp.num_threads();
+
+  // Construct the op without a body builder: we need to clone the ops in the
+  // body explicitly after having access to the new bbArgs.
+  // As a consequence, `ensureTerminator` is not called and the `forOp` body
+  // has no terminator.
+  PerformConcurrentlyOp performConcurrentlyOp = inParallelOp.getTerminator();
+  SmallVector<Value> valuesToYield = getValuesToYield(performConcurrentlyOp);
+  scf::ForOp forOp =
+      rewriter.create<scf::ForOp>(loc, zero, numThreads, one, valuesToYield);
+
+  // Move the body while replacing the threadId by the forOp iv.
+  SmallVector<Value> bbArgsTranslated{forOp.getInductionVar()};
+  Block *body = forOp.getBody();
+  bool hasTerminator =
+      !body->empty() && body->back().hasTrait<OpTrait::IsTerminator>();
+  if (hasTerminator) {
+    rewriter.mergeBlockBefore(&inParallelOp.region().front(),
+                              body->getTerminator(), bbArgsTranslated);
+  } else {
+    rewriter.mergeBlocks(&inParallelOp.region().front(), body,
+                         bbArgsTranslated);
+  }
+
+  rewriter.setInsertionPointToStart(body);
+  BlockAndValueMapping bvm;
+  bvm.map(valuesToYield, forOp.getRegionIterArgs());
+
+  // Create sequential insertSlice ops.
+  SmallVector<Value> toYield;
+  rewriter.setInsertionPoint(performConcurrentlyOp);
+  for (ParallelInsertSliceOp op : performConcurrentlyOp.yieldingOps()) {
+    toYield.push_back(rewriter.createOrFold<tensor::InsertSliceOp>(
+        loc, op.source(), bvm.lookup(op.dest()), op.getMixedOffsets(),
+        op.getMixedSizes(), op.getMixedStrides()));
+  }
+
+  // performConcurrentlyOp.yieldedValues come from above, not from bbArgs.
+  // There is no rewriter method to make mergeBlocks update non-bbArg values,
+  // so manually clone (through `bvm`) all uses that are now nested under forOp.
+  // Warning: this replacement is currently optimistic and may change the
+  // semantics as explained in the pass description in Passes.td.
+  SmallVector<Operation *> opsToReplace;
+  for (Value toReplace : valuesToYield) {
+    for (OpOperand &u : toReplace.getUses()) {
+      Operation *op = u.getOwner();
+      if (!forOp->isProperAncestor(op)) continue;
+      opsToReplace.push_back(op);
+    }
+  }
+  for (Operation *op : opsToReplace) {
+    OpBuilder::InsertionGuard g(rewriter);
+    rewriter.setInsertionPoint(op);
+    Operation *cloned = rewriter.clone(*op, bvm);
+    rewriter.replaceOp(op, cloned->getResults());
+  }
+
+  // Insert terminator.
+  if (!hasTerminator) {
+    rewriter.setInsertionPointToEnd(body);
+    rewriter.create<scf::YieldOp>(loc, toYield);
+  }
+
+  // Cleanup and replace.
+  rewriter.eraseOp(performConcurrentlyOp);
+  rewriter.replaceOp(inParallelOp, forOp.getResults());
+
+  return forOp;
+}
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/LinalgExtBufferization.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/LinalgExtBufferization.cpp
new file mode 100644
index 0000000..6a03048
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/LinalgExtBufferization.cpp
@@ -0,0 +1,347 @@
+//===-- LinalgExtBufferization.cpp - Linalg Extension bufferization -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "iree-dialects/Dialect/LinalgExt/LinalgExtBufferization.h"
+
+#include <mlir/IR/BuiltinOps.h>
+
+#include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h"
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
+#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
+#include "mlir/IR/PatternMatch.h"
+
+using namespace mlir;
+using namespace mlir::iree_compiler::IREE::LinalgExt;
+
+/// Return the destinations that an InParallelOp is inserting into. One per
+/// ParallelInsertSliceOp.
+static SmallVector<OpOperand *> getInsertionDest(InParallelOp inParallelOp) {
+  Operation *terminator = inParallelOp.region().front().getTerminator();
+  auto performConcOp = dyn_cast<PerformConcurrentlyOp>(terminator);
+  assert(performConcOp && "expected PerformConcurrentlyOp as terminator");
+
+  SmallVector<OpOperand *> result;
+  performConcOp.walk([&](ParallelInsertSliceOp insertOp) {
+    result.push_back(&insertOp->getOpOperand(1) /*dest*/);
+  });
+
+  return result;
+}
+
+namespace mlir {
+
+using bufferization::BufferizableOpInterface;
+using bufferization::BufferizationState;
+using bufferization::BufferRelation;
+using bufferization::getMemRefType;
+using bufferization::replaceOpWithBufferizedValues;
+using bufferization::replaceOpWithNewBufferizedOp;
+using tensor::ExtractSliceOp;
+
+namespace iree_compiler {
+namespace IREE {
+namespace LinalgExt {
+
+/// Bufferization of InParallelOp. This also bufferizes the terminator of the
+/// region. There are op interfaces for the terminators (PerformConcurrentlyOp
+/// and ParallelInsertSliceOp), but these are only used during analysis, not
+/// for bufferization.
+struct InParallelOpInterface
+    : public BufferizableOpInterface::ExternalModel<InParallelOpInterface,
+                                                    InParallelOp> {
+  SmallVector<OpOperand *> getAliasingOpOperand(
+      Operation *op, OpResult opResult, const BufferizationState &state) const {
+    // Get OpOperand (dest) from corresponding ParallelInsertSliceOp.
+    auto inParallelOp = cast<InParallelOp>(op);
+    return {getInsertionDest(inParallelOp)[opResult.getResultNumber()]};
+  }
+
+  bool isMemoryWrite(Operation *op, OpResult opResult,
+                     const BufferizationState &state) const {
+    // This op is a memory write. Stop lookup here to avoid finding false
+    // conflicts involving this op and one of the ops in the region. This is
+    // similar to how scf.if ops are analyzed.
+    return true;
+  }
+
+  bool isAllocationHoistingBarrier(Operation *op) const { return true; }
+
+  BufferRelation bufferRelation(Operation *op, OpResult opResult,
+                                const BufferizationState &state) const {
+    return BufferRelation::Equivalent;
+  }
+
+  LogicalResult bufferize(Operation *op, RewriterBase &b,
+                          const BufferizationState &state) const {
+    OpBuilder::InsertionGuard g(b);
+    auto inParallelOp = cast<InParallelOp>(op);
+    Block *body = &inParallelOp.region().front();
+    Operation *oldTerminator = body->getTerminator();
+    assert(isa<PerformConcurrentlyOp>(oldTerminator) &&
+           "unexpected terminator");
+
+    // Gather new results of the InParallelOp.
+    SmallVector<Value> newResults;
+    for (OpResult opResult : inParallelOp->getOpResults()) {
+      SmallVector<OpOperand *> insertDestOperands =
+          state.getAliasingOpOperand(opResult);
+      assert(insertDestOperands.size() == 1 &&
+             "expected exactly one aliasing OpOperand");
+      // Insert copies right before the PerformConcurrentlyOp terminator. They
+      // should not be inside the terminator (which would be the default
+      // insertion point).
+      Value buffer = *state.getBuffer(
+          b, *insertDestOperands.front(), /*forceInPlace=*/false,
+          /*customCopyInsertionPoint=*/oldTerminator);
+      newResults.push_back(buffer);
+      Value destTensor = insertDestOperands.front()->get();
+
+      // Replace all uses of the insert dest tensor inside the InParallelOp
+      // with the result buffer.
+      OpBuilder::InsertionGuard g(b);
+      b.setInsertionPointToStart(body);
+      Value toTensorOp =
+          b.create<bufferization::ToTensorOp>(inParallelOp.getLoc(), buffer);
+      for (OpOperand &use : destTensor.getUses())
+        if (body->findAncestorOpInBlock(*use.getOwner()))
+          // This is a use inside the InParallelOp.
+          use.set(toTensorOp);
+    }
+
+    // Create new InParallelOp without any results.
+    TypeRange newResultTypes;
+    auto newInParallelOp = b.create<InParallelOp>(
+        inParallelOp.getLoc(), newResultTypes, inParallelOp.num_threads());
+
+    // Delete terminator.
+    newInParallelOp.getBody()->getTerminator()->erase();
+
+    // Move over block contents of the old op.
+    b.mergeBlocks(inParallelOp.getBody(), newInParallelOp.getBody(),
+                  {newInParallelOp.getBody()->getArgument(0)});
+
+    // Bufferize terminator.
+    auto performConcurrentlyOp =
+        cast<PerformConcurrentlyOp>(newInParallelOp.getBody()->getTerminator());
+    b.setInsertionPoint(performConcurrentlyOp);
+    WalkResult walkResult =
+        performConcurrentlyOp.walk([&](ParallelInsertSliceOp insertOp) {
+          Location loc = insertOp.getLoc();
+          Type srcType = getMemRefType(
+              insertOp.source().getType().cast<RankedTensorType>(),
+              state.getOptions());
+          Type destType =
+              getMemRefType(insertOp.dest().getType().cast<RankedTensorType>(),
+                            state.getOptions());
+          // ParallelInsertSliceOp bufferizes to a copy.
+          auto srcMemref = b.create<bufferization::ToMemrefOp>(
+              loc, srcType, insertOp.source());
+          auto destMemref = b.create<bufferization::ToMemrefOp>(
+              loc, destType, insertOp.dest());
+          Value subview = b.create<memref::SubViewOp>(
+              loc, destMemref, insertOp.getMixedOffsets(),
+              insertOp.getMixedSizes(), insertOp.getMixedStrides());
+          // This memcpy will fold away if everything bufferizes in-place.
+          if (failed(createMemCpy(b, insertOp.getLoc(), srcMemref, subview,
+                                  state.getOptions())))
+            return WalkResult::interrupt();
+          b.eraseOp(insertOp);
+          return WalkResult::advance();
+        });
+    if (walkResult.wasInterrupted()) return failure();
+
+    // Replace the op.
+    replaceOpWithBufferizedValues(b, op, newResults);
+
+    return success();
+  }
+};
+
+/// Nothing to do for PerformConcurrentlyOp.
+struct PerformConcurrentlyOpInterface
+    : public BufferizableOpInterface::ExternalModel<
+          PerformConcurrentlyOpInterface, PerformConcurrentlyOp> {
+  LogicalResult bufferize(Operation *op, RewriterBase &b,
+                          const BufferizationState &state) const {
+    llvm_unreachable("op does not have any tensor OpOperands / OpResults");
+    return failure();
+  }
+};
+
+/// Return true if the (ExtractSliceOp, ParallelInsertSliceOp) pair matches,
+/// i.e., has equivalent operands/results and the same offsets, sizes, and
+/// strides specification.
+static bool areEquivalentExtractSliceOps(const BufferizationState &state,
+                                         ExtractSliceOp st,
+                                         ParallelInsertSliceOp sti) {
+  if (!st || !sti) return false;
+  if (st != sti &&
+      !state.areEquivalentBufferizedValues(st.source(), sti.dest()))
+    return false;
+  if (!sameOffsetsSizesAndStrides(st, sti, isEqualConstantIntOrValue))
+    return false;
+  return true;
+}
+
+/// Return true if `value` originates from an ExtractSliceOp that matches the
+/// given ParallelInsertSliceOp.
+static bool hasMatchingExtractSliceOp(const BufferizationState &state,
+                                      Value value,
+                                      ParallelInsertSliceOp insertOp) {
+  auto condition = [&](Value val) {
+    if (auto extractOp = val.getDefiningOp<ExtractSliceOp>())
+      if (areEquivalentExtractSliceOps(state, extractOp, insertOp)) return true;
+    return false;
+  };
+
+  return llvm::all_of(state.findValueInReverseUseDefChain(value, condition),
+                      condition);
+}
+
+/// Analysis of ParallelInsertSliceOp.
+struct ParallelInsertSliceOpInterface
+    : public BufferizableOpInterface::ExternalModel<
+          ParallelInsertSliceOpInterface, ParallelInsertSliceOp> {
+  SmallVector<OpResult> getAliasingOpResult(
+      Operation *op, OpOperand &opOperand,
+      const BufferizationState &state) const {
+    if (&opOperand != &op->getOpOperand(1) /*dest*/) return {};
+
+    // ParallelInsertSliceOp itself has no results. Tensors are returned via
+    // the parent op.
+    auto inParallelOp = op->getParentOfType<InParallelOp>();
+    assert(inParallelOp &&
+           "could not find valid owner of parallel_insert_slice");
+
+    // The value inserted by the i-th ParallelInsertSliceOp is returned via the
+    // i-th OpResult of the parent InParallelOp.
+    Block *block = op->getBlock();
+    unsigned int opIdx = 0;
+    for (ParallelInsertSliceOp insertOp :
+         block->getOps<ParallelInsertSliceOp>()) {
+      if (insertOp.getOperation() == op) break;
+      ++opIdx;
+    }
+    assert(opIdx < inParallelOp->getNumResults() &&
+           "could not find op inside terminator op");
+
+    return {inParallelOp->getResult(opIdx)};
+  }
+
+  bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
+                              const BufferizationState &state) const {
+    return true;
+  }
+
+  bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
+                               const BufferizationState &state) const {
+    return &opOperand == &op->getOpOperand(1) /*dest*/;
+  }
+
+  BufferRelation bufferRelation(Operation *op, OpResult opResult,
+                                const BufferizationState &state) const {
+    return BufferRelation::Equivalent;
+  }
+
+  LogicalResult bufferize(Operation *op, RewriterBase &b,
+                          const BufferizationState &state) const {
+    // Will be bufferized as part of InParallelOp.
+    return failure();
+  }
+
+  // TODO: This is copied from TensorInterfaceImpl.cpp. Find a way to share
+  // the code.
+  bool isNotConflicting(Operation *op, OpOperand *uRead,
+                        OpOperand *uConflictingWrite,
+                        const BufferizationState &state) const {
+    Operation *readingOp = uRead->getOwner();
+    Operation *conflictingWritingOp = uConflictingWrite->getOwner();
+
+    // Special rules for matching ExtractSliceOp/InsertSliceOp pairs. If
+    // uRead is an InsertSliceOp...
+    if (auto insertSliceOp = dyn_cast<ParallelInsertSliceOp>(readingOp)) {
+      // As an example, consider the following IR.
+      //
+      // %0 = tensor.extract_slice %t[%a, %b][%c, %d][1, 1] {inplace = [true] }
+      // %1 = linalg.fill %cst, %0 {inplace= [true] }
+      // %2 = tensor.insert_slice %1 into %t[%a, %b][%c, %d][1, 1]
+      //     {inplace= [true] }
+
+      // TODO: Use insertSliceOp.getDestOpOperand etc. when available.
+      if (uRead == &insertSliceOp->getOpOperand(1) /*dest*/ &&
+          hasMatchingExtractSliceOp(state, uConflictingWrite->get(),
+                                    insertSliceOp))
+        // Case 1: The main insight is that InsertSliceOp reads only part of
+        // the destination tensor. The overwritten area is not read. If
+        // uConflictingWrite writes into exactly the memory location that is
+        // being read by uRead, this is not a conflict.
+        //
+        // In the above example:
+        // uRead             = OpOperand 1 (%t) of tensor.insert_slice
+        // uConflictingWrite = OpOperand 1 (%0) of linalg.fill
+        //
+        // The read of %t does not conflict with the write of the FillOp
+        // (same aliases!) because the area that the FillOp operates on is
+        // exactly the one that is *not* read via %t.
+        return true;
+
+      if (uRead == &insertSliceOp->getOpOperand(0) /*source*/ &&
+          uConflictingWrite == &insertSliceOp->getOpOperand(1) /*dest*/ &&
+          hasMatchingExtractSliceOp(state, uRead->get(), insertSliceOp))
+        // Case 2: The read of the source tensor and the write to the dest
+        // tensor via an InsertSliceOp is not a conflict if the read is
+        // reading exactly that part of an equivalent tensor that the
+        // InsertSliceOp is writing.
+        //
+        // In the above example:
+        // uRead             = OpOperand 0 (%1) of tensor.insert_slice
+        // uConflictingWrite = OpOperand 1 (%t) of tensor.insert_slice
+        return true;
+    }
+
+    // If uConflictingWrite is an InsertSliceOp...
+    if (auto insertSliceOp =
+            dyn_cast<ParallelInsertSliceOp>(conflictingWritingOp))
+      // As an example, consider the following IR.
+      //
+      // %0 = tensor.extract_slice %t[%a, %b][%c, %d][1, 1] {inplace = [true] }
+      // %1 = linalg.fill %cst, %0 {inplace= [true] }
+      // %2 = tensor.insert_slice %1 into %t[%a, %b][%c, %d][1, 1]
+      //     {inplace= [true] }
+      // %3 = vector.transfer_read %1, %cst
+      //
+      // In the above example:
+      // uRead             = OpOperand 0 (%1) of vector.transfer_read
+      // uConflictingWrite = OpOperand 1 (%t) of tensor.insert_slice
+      // lastWrite         = %1
+      //
+      // This is not a conflict because the InsertSliceOp overwrites the
+      // memory segment of %1 with the exact same data. (Effectively, there
+      // is no memory write here.)
+      if (uConflictingWrite == &insertSliceOp->getOpOperand(1) /*dest*/ &&
+          state.areEquivalentBufferizedValues(uRead->get(),
+                                              insertSliceOp.source()) &&
+          hasMatchingExtractSliceOp(state, insertSliceOp.source(),
+                                    insertSliceOp))
+        return true;
+
+    return false;
+  }
+};
+}  // namespace LinalgExt
+}  // namespace IREE
+}  // namespace iree_compiler
+}  // namespace mlir
+
+void mlir::iree_compiler::IREE::LinalgExt::
+    registerBufferizableOpInterfaceExternalModels(DialectRegistry &registry) {
+  registry.addOpInterface<InParallelOp, InParallelOpInterface>();
+  registry
+      .addOpInterface<PerformConcurrentlyOp, PerformConcurrentlyOpInterface>();
+  registry
+      .addOpInterface<ParallelInsertSliceOp, ParallelInsertSliceOpInterface>();
+}
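
A minimal sketch of wiring these external models into a context before running bufferization (the registration entry point is as declared above; everything else is assumed):

    DialectRegistry registry;
    registry.insert<IREE::LinalgExt::IREELinalgExtDialect>();
    mlir::iree_compiler::IREE::LinalgExt::
        registerBufferizableOpInterfaceExternalModels(registry);
    MLIRContext context(registry);
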
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/TileToInParallel.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/TileToInParallel.cpp
new file mode 100644
index 0000000..83ece71
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/TileToInParallel.cpp
@@ -0,0 +1,132 @@
+//===- TileToInParallel.cpp - Rewrite TileOp as InParallel ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h"
+#include "iree-dialects/Dialect/LinalgExt/Transforms/Transforms.h"
+#include "iree-dialects/Dialect/LinalgExt/Transforms/Utils.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/SCF/SCF.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+using namespace mlir::iree_compiler::IREE::LinalgExt;
+
+FailureOr<iree_compiler::IREE::LinalgExt::InParallelOp> mlir::iree_compiler::
+    IREE::LinalgExt::TileOpToInParallelRewriter::returningMatchAndRewrite(
+        iree_compiler::IREE::LinalgExt::TileOp tileOp,
+        PatternRewriter &rewriter) const {
+  // TODO: verifier.
+  assert(tileOp.getNumResults() > 0 &&
+         tileOp.outs().size() == tileOp.getNumResults());
+
+  // TODO: when supported, iterate over the tensor of sizes. This will be
+  // iterating through a level of indirection.
+
+  int64_t tiledDim = tileOp.tiled_dim();
+
+  // Construct the loop bounds based on the canonical arithmetic progression.
+  Location loc = tileOp.getLoc();
+  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+  Value tiledDimValue = rewriter.create<arith::ConstantIndexOp>(loc, tiledDim);
+  Value one = rewriter.create<arith::ConstantIndexOp>(loc, 1);
+  Value totalSize =
+      rewriter.create<tensor::DimOp>(loc, tileOp.outs().front(), tiledDimValue);
+  Value step = tileOp.tile_size();
+  assert(step.getType().isa<IndexType>() && "NYI: not an index type");
+
+  using AV = AffineValueExpr;
+  AffineBuilder ab(rewriter, loc);
+  AffineExpr i, j, M;
+  bindDims(rewriter.getContext(), i, j);
+  bindSymbols(rewriter.getContext(), M);
+  Value numThreads = ab.ceil(AV(i).bind(totalSize), AV(M).bind(step));
+
+  // Construct the op without a body builder: we need to clone the ops in the
+  // body explicitly after having access to the new bbArgs.
+  // As a consequence, `ensureTerminator` is not called and the body has no
+  // terminator.
+  iree_compiler::IREE::LinalgExt::InParallelOp inParallelOp =
+      rewriter.create<iree_compiler::IREE::LinalgExt::InParallelOp>(
+          loc, tileOp->getResultTypes(), numThreads);
+
+  // At the beginning of the InParallelOp, compute offset and sizes.
+  rewriter.setInsertionPointToStart(inParallelOp.getBody());
+
+  // Materialize the implicit subtensors as explicit subset_extract.
+  // TODO: generalize to multiple offset/chunk_size bbargs if needed.
+  // TODO: generalize the subset op.
+  SmallVector<Value> leadingOffsets, leadingSizes, leadingStrides;
+  for (int64_t i = 0; i < tiledDim; ++i) {
+    leadingOffsets.push_back(zero);
+    leadingSizes.push_back(
+        rewriter.createOrFold<tensor::DimOp>(loc, tileOp.outs().front(), i));
+    leadingStrides.push_back(one);
+  }
+  // clang-format off
+  Value offset = ab.mul(AV(i).bind(inParallelOp.getThreadIndex()),
+                        AV(M).bind(step));
+  Value size = ab.min(ValueRange{
+      ab.sub(AV(i).bind(totalSize), AV(j).bind(offset)), step});
+  // clang-format on
+  leadingOffsets.push_back(offset);
+  leadingSizes.push_back(size);
+  leadingStrides.push_back(one);
+
+  SmallVector<Value> implicitSubtensorExtracts;
+  for (Value tensor : tileOp.outs()) {
+    implicitSubtensorExtracts.push_back(
+        createSubsetExtractOpFromLeadingOffsetsSizesAndStrides(
+            rewriter, loc, tensor, leadingOffsets, leadingSizes,
+            leadingStrides));
+  }
+
+  // Get a reference to the TileOp terminator before the body is merged, after
+  // which the terminator becomes harder to reach.
+  auto tileYieldOp = cast<TileYieldOp>(tileOp.getBody()->getTerminator());
+
+  // Regroup the values that replace the tileOp's bbArg and move the body.
+  SmallVector<Value> bbArgsTranslated{offset, size};
+  llvm::append_range(bbArgsTranslated, implicitSubtensorExtracts);
+  rewriter.mergeBlockBefore(&tileOp.region().front(),
+                            inParallelOp.getBody()->getTerminator(),
+                            bbArgsTranslated);
+
+  // tileOp's terminator is not a valid terminator here; insert explicit
+  // parallel_insert_slice ops into the PerformConcurrentlyOp terminator
+  // instead.
+  PerformConcurrentlyOp performConcurrentlyOp = inParallelOp.getTerminator();
+
+  for (auto it : llvm::zip(tileYieldOp->getOperands(), tileOp.outs())) {
+    SmallVector<Value> offsets, sizes, strides;
+    completeOffsetsSizesAndStrides(rewriter, loc, std::get<0>(it),
+                                   leadingOffsets, leadingSizes, leadingStrides,
+                                   offsets, sizes, strides);
+    OpBuilder::InsertionGuard g(rewriter);
+    rewriter.setInsertionPoint(
+        performConcurrentlyOp.getBody()->getTerminator());
+    createParallelInsertSliceOpFromLeadingOffsetsSizesAndStrides(
+        rewriter, loc, std::get<0>(it), std::get<1>(it), offsets, sizes,
+        strides);
+  }
+
+  // Cleanup and replace.
+  rewriter.eraseOp(tileYieldOp);
+  rewriter.replaceOp(tileOp, inParallelOp.getResults());
+
+  return inParallelOp;
+}
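
The `ab.ceil` above computes ceildiv(totalSize, step), i.e. the number of threads needed to cover the tiled dimension. An equivalent formulation with upstream affine APIs, assuming `rewriter`, `loc`, `totalSize`, and `step` are in scope:

    AffineExpr s0, s1;
    bindSymbols(rewriter.getContext(), s0, s1);
    Value numThreads = rewriter.createOrFold<AffineApplyOp>(
        loc, AffineMap::get(/*dimCount=*/0, /*symbolCount=*/2, s0.ceilDiv(s1)),
        ValueRange{totalSize, step});
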
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/TileToSequentialFor.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/TileToSequentialFor.cpp
new file mode 100644
index 0000000..657eedd
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/TileToSequentialFor.cpp
@@ -0,0 +1,106 @@
+//===- TileToSequentialFor.cpp - Rewrite TileOp as scf.for ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+
+#include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h"
+#include "iree-dialects/Dialect/LinalgExt/Transforms/Transforms.h"
+#include "iree-dialects/Dialect/LinalgExt/Transforms/Utils.h"
+#include "llvm/ADT/STLExtras.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/SCF/SCF.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/OpDefinition.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+using namespace mlir::iree_compiler::IREE::LinalgExt;
+
+FailureOr<scf::ForOp> mlir::iree_compiler::IREE::LinalgExt::
+    TileOpToSCFRewriter::returningMatchAndRewrite(
+        iree_compiler::IREE::LinalgExt::TileOp tileOp,
+        PatternRewriter &rewriter) const {
+  // TODO: verifier.
+  assert(tileOp.getNumResults() > 0 &&
+         tileOp.outs().size() == tileOp.getNumResults());
+
+  // TODO: when supported, iterate over the tensor of sizes. This will be
+  // iterating through a level of indirection.
+
+  // Construct the loop bounds based on the canonical arithmetic progression.
+  Location loc = tileOp.getLoc();
+  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+  Value one = rewriter.create<arith::ConstantIndexOp>(loc, 1);
+  Value totalSize =
+      rewriter.create<tensor::DimOp>(loc, tileOp.outs().front(), zero);
+  Value step = tileOp.tile_size();
+  assert(step.getType().isa<IndexType>() && "NYI: not an index type");
+
+  // Construct the op without a body builder: we need to clone the ops in the
+  // body explicitly after having access to the new bbArgs.
+  // As a consequence, `ensureTerminator` is not called and the body has no
+  // terminator.
+  scf::ForOp forOp =
+      rewriter.create<scf::ForOp>(loc, zero, totalSize, step, tileOp.outs());
+
+  rewriter.setInsertionPointToStart(forOp.getBody());
+
+  // TODO: when supported, also compute from the tensor of sizes.
+  using AV = AffineValueExpr;
+  AffineBuilder ab(rewriter, loc);
+  AffineExpr i, j, M;
+  bindDims(rewriter.getContext(), i, j);
+  bindSymbols(rewriter.getContext(), M);
+
+  // Materialize the implicit subtensors as explicit subset_extract.
+  // TODO: generalize to multiple offset/chunk_size bbargs if needed.
+  // TODO: generalize the subset op.
+  Value offset = forOp.getInductionVar();
+  // clang-format off
+  Value size = ab.min(ValueRange{
+      ab.sub(AV(i).bind(totalSize), AV(j).bind(offset)), step});
+  // clang-format on
+  SmallVector<Value> implicitSubtensorExtracts;
+  for (Value tensor : forOp.getRegionIterArgs()) {
+    implicitSubtensorExtracts.push_back(
+        createSubsetExtractOpFromLeadingOffsetsSizesAndStrides(
+            rewriter, loc, tensor, offset, size, one));
+  }
+
+  // Regroup the values that replace the tileOp's bbArg and move the body.
+  SmallVector<Value> bbArgsTranslated{offset, size};
+  llvm::append_range(bbArgsTranslated, implicitSubtensorExtracts);
+  rewriter.mergeBlocks(&tileOp.region().front(), forOp.getBody(),
+                       bbArgsTranslated);
+  // tileOp's terminator is not a valid scf.for terminator; insert explicit
+  // subset_insert ops and feed them to a new scf.yield terminator that we now
+  // add.
+  auto tileYieldOp = cast<TileYieldOp>(&forOp.getBody()->back());
+  SmallVector<Value> implicitSubtensorInserts;
+  for (auto it : llvm::zip(implicitSubtensorExtracts, tileYieldOp.getOperands(),
+                           forOp.getRegionIterArgs())) {
+    implicitSubtensorInserts.push_back(createMatchingSubsetInsertOp(
+        rewriter, loc,
+        /*subsetExtractOp=*/
+        std::get<0>(it).getDefiningOp<tensor::ExtractSliceOp>(),
+        /*source=*/std::get<1>(it), /*dest=*/std::get<2>(it)));
+  }
+  // Insert terminator.
+  rewriter.setInsertionPointToEnd(forOp.getBody());
+  rewriter.create<scf::YieldOp>(loc, implicitSubtensorInserts);
+
+  // Cleanup and replace.
+  rewriter.eraseOp(tileYieldOp);
+  rewriter.replaceOp(tileOp, forOp.getResults());
+
+  return forOp;
+}
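
For readers without Utils.h at hand, a plausible reading of `createMatchingSubsetInsertOp` (a hypothetical sketch, not the actual helper): it inserts `source` into `dest` at exactly the coordinates the extract was taken from:

    static Value createMatchingSubsetInsertOpSketch(
        OpBuilder &b, Location loc, tensor::ExtractSliceOp extractOp,
        Value source, Value dest) {
      // Reuse the extract's offsets/sizes/strides so insert mirrors extract.
      return b.create<tensor::InsertSliceOp>(
          loc, source, dest, extractOp.getMixedOffsets(),
          extractOp.getMixedSizes(), extractOp.getMixedStrides());
    }
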
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/Tiling.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/Tiling.cpp
index 25df1f8..0e55970 100644
--- a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/Tiling.cpp
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/Tiling.cpp
@@ -1,360 +1,216 @@
-// Copyright 2021 The IREE Authors
+//===- Tiling.cpp - Tiling using TilingInterface --------------------------===//
 //
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
 
-#include "iree-dialects/Dialect/Input/InputDialect.h"
-#include "iree-dialects/Dialect/Input/InputOps.h"
-#include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtDialect.h"
 #include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h"
-#include "iree-dialects/Dialect/LinalgExt/Transforms/PassDetail.h"
-#include "iree-dialects/Dialect/LinalgExt/Transforms/Passes.h"
-#include "iree-dialects/Dialect/LinalgExt/Transforms/Transforms.h"
-#include "llvm/ADT/TypeSwitch.h"
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/Linalg/IR/Linalg.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/SCF/SCF.h"
+#include "iree-dialects/Dialect/LinalgExt/Transforms/Utils.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/Linalg/Utils/Utils.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/Dialect/Utils/StaticValueUtils.h"
-#include "mlir/IR/Matchers.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/OperationSupport.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
 using namespace mlir;
-namespace IREE = mlir::iree_compiler::IREE;
-using namespace IREE::LinalgExt;
+using namespace mlir::iree_compiler::IREE::LinalgExt;
 
-//===----------------------------------------------------------------------===//
-// Utility methods for tiling a linalg_ext operation that implements a
-// TiledOpInterface
-//===----------------------------------------------------------------------===//
+// TODO: connect these patterns to PDL, either via the transform dialect or via
+// PDLL.
 
-/// Returns failure if the options are unsupported.
-static LogicalResult verifySupportedTilingOptions(
-    PatternRewriter &rewriter, Operation *op,
-    const linalg::LinalgTilingOptions &options) {
-  if (!options.interchangeVector.empty()) {
-    return rewriter.notifyMatchFailure(op,
-                                       "unsupported interchange during tiling");
+static bool isZero(Value v) {
+  if (auto cst = v.getDefiningOp<arith::ConstantIndexOp>())
+    return cst.value() == 0;
+  return false;
+}
+
+SmallVector<Value> tileToSCF(PatternRewriter &rewriter, TilingInterface op,
+                             TilingInterface clonedOp, ValueRange tileSizes) {
+  // Compute lower and upper bounds of the loop nest.
+  SmallVector<Range> ranges = clonedOp.getIterationDomain(rewriter);
+  assert(tileSizes.size() <= ranges.size() &&
+         "expected at most one tile size per loop");
+
+  // Fill the tile sizes with zeros for the untiled dimensions.
+  Location loc = op->getLoc();
+  SmallVector<Value> tileSizesVec(tileSizes.begin(), tileSizes.end());
+  if (ranges.size() != tileSizes.size()) {
+    Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+    tileSizesVec.resize(ranges.size(), zero);
   }
-  if (options.loopType != linalg::LinalgTilingLoopType::Loops) {
-    return rewriter.notifyMatchFailure(op,
-                                       "only tiling with scf.for is supported");
-  }
-  if (options.distribution) {
-    if (llvm::any_of(options.distribution->distributionMethod,
-                     [](linalg::DistributionMethod method) {
-                       return method != linalg::DistributionMethod::Cyclic;
-                     })) {
-      return rewriter.notifyMatchFailure(op,
-                                         "only cyclic distibution is allowed");
+
+  SmallVector<Value> lbs, dims, allDims, steps;
+  for (auto it : llvm::enumerate(ranges)) {
+    allDims.push_back(it.value().size);
+    if (!isZero(tileSizesVec[it.index()])) {
+      lbs.push_back(it.value().offset);
+      dims.push_back(it.value().size);
+      steps.push_back(tileSizesVec[it.index()]);
     }
   }
-  return success();
-}
 
-/// Converts an `OpFoldResult` to a `Value` by building a constant op if
-/// if the `OpFoldResult` is an `IntegerAttr`.
-static Value getValue(OpBuilder &builder, Location loc,
-                      OpFoldResult valueOrAttr) {
-  if (auto attr = valueOrAttr.dyn_cast<Attribute>()) {
-    return builder.create<arith::ConstantIndexOp>(
-        loc, attr.cast<IntegerAttr>().getInt());
-  }
-  return valueOrAttr.get<Value>();
-}
-
-/// Returns true if loop is untiled. Only checks if the value is statically
-/// zero. It is assumed that a `Value` defined by a constant op is already
-/// converted to an `IntegerAttr` of that value. So here just return true if
-/// this is an attribute with a zero value.
-static bool isUntiledLoop(OpFoldResult valueOrAttr) {
-  Optional<int64_t> intVal = getConstantIntValue(valueOrAttr);
-  return intVal && *intVal == 0;
-}
-
-/// Generates the tiled loops and the body by invoking the interface methods of
-/// TiledOpInterface.
-/// - `outputs` are the operands to use for outputs of the tiled operation.
-/// - `tileSizes` are tile sizes specified for all loops of the operation. If a
-///   loop is to be untiled it is set to 0.
-/// - `iteratorType` is the type of the loop iterator returned by the
-///   TiledOpInterface.
-/// - `loopBounds` are the bounds of all the loops of the op returned by the
-///   TiledOpInterface.
-/// - `loopDepth` is the current loop depth being processed.
-/// - `offsets` are the `Value`s that represent the position of the tile being
-///   operated on. The offsets are computed as the tiled loops are being
-///   generated.
-/// - `distributionInfo` is the proc_id and nprocs `Value`s to be used for
-///   distributed loops. It is a stack, and once an entry at the top of the
-///   stack is used for distribution it is popped before processing the inner
-///   loops.
-static FailureOr<TiledOp> tileInterfaceOpImpl(
-    OpBuilder &builder, TiledOpInterface tilableOp, ValueRange outputs,
-    MutableArrayRef<OpFoldResult> tileSizes, ArrayRef<StringRef> iteratorTypes,
-    ArrayRef<Range> loopBounds, unsigned loopDepth,
-    SmallVectorImpl<OpFoldResult> &offsets,
-    ArrayRef<linalg::ProcInfo> distributionInfo) {
-  Location loc = tilableOp.getLoc();
-  // If this is the innermost loop, then generated the tiled implementation of
-  // the op by invoking the TiledOpInterface methods.
-  if (loopDepth == tileSizes.size()) {
-    TiledOp ret;
-    ret.op = tilableOp.getTiledImplementation(builder, outputs, offsets,
-                                              tileSizes, ret.results);
-    if (!ret.op) {
-      return static_cast<LogicalResult>(
-          tilableOp.emitOpError("failed to get tiled implementation"));
-    }
-    return ret;
-  }
-
-  // If tile size at this depth is empty, do nothing.
-  if (isUntiledLoop(tileSizes[loopDepth])) {
-    auto zeroAttr = builder.getI64IntegerAttr(0);
-    offsets.push_back(zeroAttr);
-    assert(matchPattern(loopBounds[loopDepth].offset, m_Zero()) &&
-           "expected loop bounds to have lower bound of zero");
-    tileSizes[loopDepth] = getAsOpFoldResult(loopBounds[loopDepth].size);
-    return tileInterfaceOpImpl(builder, tilableOp, outputs, tileSizes,
-                               iteratorTypes, loopBounds, loopDepth + 1,
-                               offsets, distributionInfo);
-  }
-
-  // Generate an scf.for for the current loop depth.
-  Value lb = loopBounds[loopDepth].offset;
-  Value ub = loopBounds[loopDepth].size;
-  // TODO(#7073): Put the check back. This is required by tiling linalg_ext.fft
-  // op. We can put the check back after updating linalg_ext.fft semantics.
-  // if (!matchPattern(loopBounds[loopDepth].stride, m_One())) {
-  // return static_cast<LogicalResult>(
-  // tilableOp.emitOpError("expected stride to be 1"));
-  //}
-  Value step = getValue(builder, loc, tileSizes[loopDepth]);
-
-  // Update lb, ub and step for cyclic distribution.
-  if (!distributionInfo.empty() &&
-      iteratorTypes[loopDepth] == getParallelIteratorTypeName()) {
-    linalg::updateBoundsForCyclicDistribution(
-        builder, loc, distributionInfo.front().procId,
-        distributionInfo.front().nprocs, lb, ub, step);
-    distributionInfo = distributionInfo.drop_front();
-  }
-  FailureOr<TiledOp> innerReturnValue;
-  bool isBufferTiling = tilableOp->getNumResults() == 0;
-  ValueRange initValues(isBufferTiling ? ValueRange{} : outputs);
-  auto forOp = builder.create<scf::ForOp>(
-      loc, lb, ub, step, initValues,
-      [&](OpBuilder &b, Location loc, Value iv, ValueRange args) {
-        offsets.push_back(iv);
-        auto affineMaps = AffineMap::inferFromExprList({ArrayRef<AffineExpr>{
-            b.getAffineSymbolExpr(0),
-            b.getAffineSymbolExpr(1) - b.getAffineDimExpr(0)}})[0];
-        // Similar to linalg tiling, the tile size is the min(tileSizes, ub -
-        // iv) to account for cases where tile size does not divide (ub - lb)
-        // exactly.
-        Value inBoundsTileSize = b.create<AffineMinOp>(
-            loc, affineMaps,
-            ValueRange{iv, getValue(builder, loc, tileSizes[loopDepth]), ub});
-        tileSizes[loopDepth] = getAsOpFoldResult(inBoundsTileSize);
-        // Recursively proceed to generate the tiled loop for the next level.
-        innerReturnValue =
-            tileInterfaceOpImpl(b, tilableOp, (isBufferTiling ? outputs : args),
-                                tileSizes, iteratorTypes, loopBounds,
-                                loopDepth + 1, offsets, distributionInfo);
-        if (failed(innerReturnValue)) return;
-        b.create<scf::YieldOp>(loc, innerReturnValue->results);
+  // Generate loop nest: One loop per dimension.
+  llvm::SmallPtrSet<Operation *, 1> preservedUses;
+  SmallVector<Value> destOperand = clonedOp.getDestinationOperands(rewriter);
+  auto loopNest = mlir::scf::buildLoopNest(
+      rewriter, loc, lbs, /*ubs=*/dims, steps, ValueRange(destOperand),
+      [&](OpBuilder &b, Location loc, ValueRange localIvs,
+          ValueRange iterArgs) -> scf::ValueVector {
+        // Compute offsets and sizes of ExtractSliceOp.
+        SmallVector<Value> offsets =
+            linalg::computeTileOffsets(b, loc, localIvs, tileSizesVec);
+        SmallVector<Value> sizes =
+            linalg::computeTileSizes(b, loc, localIvs, tileSizesVec, allDims);
+        // Create ExtractSliceOp: Extract a tile from the cloned op's result.
+        // Note: The cloned op is located outside of the loop nest. It is
+        // later moved inside by SliceOpTiledOpSwapPattern.
+        auto map =
+            AffineMap::getMultiDimIdentityMap(ranges.size(), b.getContext());
+        assert(clonedOp->getNumResults() == 1 && "expected single result op");
+        Value tiledOutput =
+            linalg::makeTiledShape(b, loc, clonedOp->getResult(0), tileSizesVec,
+                                   map, offsets, allDims, sizes);
+        auto sliceOp = tiledOutput.getDefiningOp<tensor::ExtractSliceOp>();
+        assert(sliceOp && "expected ExtractSliceOp");
+        preservedUses.insert(sliceOp);
+        // Insert the tile into the output tensor.
+        Value yieldValue =
+            createMatchingSubsetInsertOp(b, loc, sliceOp, sliceOp, iterArgs[0]);
+        return scf::ValueVector({yieldValue});
       });
-  if (failed(innerReturnValue)) {
-    return innerReturnValue;
-  }
-  innerReturnValue->loops.insert(innerReturnValue->loops.begin(),
-                                 forOp.getOperation());
-  innerReturnValue->results = forOp.getResults();
-  return innerReturnValue;
+  return loopNest.getResults();
 }
 
-FailureOr<TiledOp> tileInterfaceOp(OpBuilder &b, TiledOpInterface tilableOp,
-                                   const linalg::LinalgTilingOptions &options) {
-  SmallVector<Value> dest = tilableOp.getDestinationOperands(b);
-  if (dest.empty()) {
-    return static_cast<LogicalResult>(tilableOp.emitOpError(
-        "cannot tile operation without destination operands"));
-  }
-
-  SmallVector<StringRef> iteratorTypes = tilableOp.getLoopIteratorTypes();
-  SmallVector<Value, 4> tileSizesVals =
-      options.tileSizeComputationFunction(b, tilableOp);
-  auto zeroAttr = b.getI64IntegerAttr(0);
-
-  // The actual tile sizes used converts `Value` defined as constant 0, to a
-  // zero integer attributes. Currently if the iterator type is not "parallel",
-  // the tile size is forced to zero as well.
-  auto tileSizes = getAsOpFoldResult(tileSizesVals);
-  tileSizes.resize(iteratorTypes.size(), zeroAttr);
-  for (auto en : llvm::enumerate(iteratorTypes)) {
-    if (en.value() == getParallelIteratorTypeName()) continue;
-    if (!isUntiledLoop(tileSizes[en.index()])) {
-      return static_cast<LogicalResult>(tilableOp.emitOpError(
-          "unimplemented tiling of non-parallel loop iterator type"));
-    }
-  }
-
-  // Trivial early exit case of tile sizes being zero for all parallel loops.
-  if (llvm::all_of(tileSizes, isUntiledLoop)) {
-    return TiledOp{tilableOp, {}, {}};
-  }
-
-  SmallVector<Range> loopBounds = tilableOp.getIterationDomain(b);
-  SmallVector<linalg::ProcInfo> distributionInfo;
-  // If the tiled loops are distributed, get the proc_id and nprocs for the
-  // distributed loops. First collect the parallel loops by iterating over the
-  // tileSizes and getting the loops that are distribute, i.e.,
-  // - parallel, i.e. iteratorTypes is "parallel"
-  // - tiled, i.e. tileSize != 0
-  if (options.distribution) {
-    SmallVector<Range> distributedLoopRange;
-    for (auto i : llvm::seq<unsigned>(0, tileSizes.size())) {
-      if (isUntiledLoop(tileSizes[i])) continue;
-      if (iteratorTypes[i] != getParallelIteratorTypeName()) continue;
-      distributedLoopRange.push_back(loopBounds[i]);
-    }
-    distributionInfo = options.distribution->procInfo(b, tilableOp.getLoc(),
-                                                      distributedLoopRange);
-  }
-
-  SmallVector<OpFoldResult> offsets;
-  return tileInterfaceOpImpl(b, tilableOp, dest, tileSizes, iteratorTypes,
-                             loopBounds, 0, offsets, distributionInfo);
-}
-
-LogicalResult TiledOpInterfaceBaseTilingPattern::matchAndRewriteBase(
-    TiledOpInterface tilableOp, PatternRewriter &rewriter,
-    TiledOp &result) const {
-  if (failed(filter.checkAndNotify(rewriter, tilableOp))) {
-    return failure();
-  }
-  if (failed(verifySupportedTilingOptions(rewriter, tilableOp, options))) {
-    return failure();
-  }
-
-  FailureOr<TiledOp> res = tileInterfaceOp(rewriter, tilableOp, options);
-  if (failed(res)) return res;
-  result = *res;
-  if (result.op) {
-    filter.replaceLinalgTransformationFilter(rewriter, result.op);
-  }
-  return success();
-}
-
-//===----------------------------------------------------------------------===//
-// Test pass for tiling Linalg Ext ops
-//===----------------------------------------------------------------------===//
-
 namespace {
-struct TiledOpInterfaceTilingPass
-    : public TiledOpInterfaceTilingBase<TiledOpInterfaceTilingPass> {
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<
-        AffineDialect, IREE::Input::IREEInputDialect, linalg::LinalgDialect,
-        IREE::LinalgExt::IREELinalgExtDialect, memref::MemRefDialect,
-        func::FuncDialect, mlir::arith::ArithmeticDialect, math::MathDialect,
-        tensor::TensorDialect, scf::SCFDialect>();
+
+/// The tiling here works in two steps. The first step is to create a loop nest based
+/// on the loop bounds of the operation obtained from `TilingInterface`.
+///
+/// ```mlir
+///   %1 = <tiling interface op> ins(...) outs(%0 : ...)
+///   ... <use_op> ... %1 ...
+/// ```
+///
+/// is rewritten using a "no-op" subtensor extract/insert pair
+///
+/// ```mlir
+///   %1 = <tiling interface op> ins(...) outs(%0 : ...)
+///   %2 = scf.for %iv0 = ... iter_args(%arg0 = %0) {
+///     %3 = scf.for %iv1 = ... iter_args(%arg1 = %arg0) {
+///       ...
+///       %4 = tensor.extract_slice %1[%iv0, %iv1]....
+///       %5 = tensor.insert_slice %4 into %arg1[%iv0, %iv1]...
+///       scf.yield %5
+///     }
+///     scf.yield %3
+///   }
+///   ... <use_op> ... %2 ...
+/// ```
+///
+/// Following this, the `TilingInterface` op -> `tensor.extract_slice` pattern
+/// is replaced with
+///
+/// ```mlir
+///   %2 = scf.for %iv0 = ... iter_args(%arg0 = %0) {
+///     %3 = scf.for %iv1 = ... iter_args(%arg1 = %arg0) {
+///       ...
+///       %4 = tensor.extract_slice %0[%iv0, %iv1]
+///       %5 = <tiling interface op> ins(...) outs(%4 : ...)
+///       %6 = tensor.insert_slice %5 into %arg1[%iv0, %iv1]...
+///       scf.yield %6
+///     }
+///     scf.yield %3
+///   }
+///   ... <use_op> ... %2 ...
+/// ```
+///
+/// TODO(ravishankarm): The current approach seems to work only for tiling the
+/// parallel loops of the operation. Specifically,
+/// 1) the `%0` in the third snippet needs to be `%arg1` for cases where the
+///    tiled loop is a reduction.
+/// 2) The current implementation uses the `getIterationDomain` method to get
+///    the initial loop structure as described in the second snippet. If any of
+///    those loops are reductions, then that IR snippet itself is wrong (replace
+///    this with the case of `linalg.matmul` and the error becomes apparent).
+
+/// First pattern to introduce the loop nests.
+struct OpTilingPattern : public OpInterfaceRewritePattern<TilingInterface> {
+  OpTilingPattern(MLIRContext *context, linalg::LinalgTilingOptions opt,
+                  linalg::LinalgTransformationFilter filt)
+      : OpInterfaceRewritePattern<TilingInterface>(context),
+        options(opt),
+        filter(filt) {}
+
+  LogicalResult matchAndRewrite(TilingInterface op,
+                                PatternRewriter &rewriter) const override {
+    if (failed(filter.checkAndNotify(rewriter, op))) return failure();
+
+    /// Currently only handles single-result operations.
+    if (op->getNumResults() != 1) return failure();
+
+    Location loc = op->getLoc();
+    // Get rank and tile sizes.
+    SmallVector<Value> tileSizes =
+        options.tileSizeComputationFunction(rewriter, op);
+    auto iteratorTypes = op.getLoopIteratorTypes();
+    Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+    tileSizes.resize(iteratorTypes.size(), zero);
+
+    /// Currently only handle operations with all parallel iterator types.
+    for (auto iteratorType : enumerate(iteratorTypes)) {
+      if (iteratorType.value() != getParallelIteratorTypeName() &&
+          !isZero(tileSizes[iteratorType.index()])) {
+        return rewriter.notifyMatchFailure(
+            op, "unhandled tiling of non-parallel iterator");
+      }
+    }
+
+    auto clonedOp = cast<TilingInterface>(rewriter.clone(*op.getOperation()));
+    SmallVector<Value> results = tileToSCF(rewriter, op, clonedOp, tileSizes);
+
+    filter.replaceLinalgTransformationFilter(rewriter, clonedOp);
+    rewriter.replaceOp(op, results);
+    return success();
   }
-  void runOnOperation() override;
+
+ private:
+  linalg::LinalgTilingOptions options;
+  linalg::LinalgTransformationFilter filter;
 };
-}  // namespace
 
-template <typename OpTy>
-static Value buildFlowWorkgroupInfoOp(OpBuilder &b, unsigned dim) {
-  return b.template create<OpTy>(b.getInsertionPoint()->getLoc(), dim);
-}
+/// Second pattern to implement the switch of `TilingInterface` ->
+/// `tensor.extract_slice` to `tensor.extract_slice` -> `TilingInterface`.
+struct SliceOpTiledOpSwapPattern
+    : public OpRewritePattern<tensor::ExtractSliceOp> {
+  SliceOpTiledOpSwapPattern(MLIRContext *context,
+                            linalg::LinalgTilingOptions opt,
+                            linalg::LinalgTransformationFilter filt)
+      : OpRewritePattern<tensor::ExtractSliceOp>(context),
+        options(opt),
+        filter(filt) {}
 
-void TiledOpInterfaceTilingPass::runOnOperation() {
-  FuncOp funcOp = getOperation();
-  MLIRContext *context = funcOp.getContext();
-
-  RewritePatternSet patterns(context);
-  patterns.add<TiledOpInterfaceTilingPattern>(
-      context, linalg::LinalgTilingOptions().setTileSizes({10, 20}),
-      linalg::LinalgTransformationFilter(
-          StringAttr::get(context, "tiling_input"),
-          StringAttr::get(context, "tiling_output")));
-  patterns.add<TiledOpInterfaceTilingPattern>(
-      context, linalg::LinalgTilingOptions().setTileSizes(ArrayRef<int64_t>{0}),
-      linalg::LinalgTransformationFilter(
-          StringAttr::get(context, "no_tiling_input"),
-          StringAttr::get(context, "no_tiling_output")));
-
-  patterns.add<TiledOpInterfaceTilingPattern>(
-      context, linalg::LinalgTilingOptions().setTileSizes({0, 20}),
-      linalg::LinalgTransformationFilter(
-          StringAttr::get(context, "outer_reduce_input"),
-          StringAttr::get(context, "outer_reduce_output")));
-  patterns.add<TiledOpInterfaceTilingPattern>(
-      context, linalg::LinalgTilingOptions().setTileSizes({10, 0, 0}),
-      linalg::LinalgTransformationFilter(
-          StringAttr::get(context, "inner_reduce_input"),
-          StringAttr::get(context, "inner_reduce_output")));
-
-  static linalg::LinalgLoopDistributionOptions workgroupDistributionOptions = {
-      [](OpBuilder &builder, Location loc, ArrayRef<Range> parallelLoopRanges) {
-        auto numParallelDims = parallelLoopRanges.size();
-
-        SmallVector<linalg::ProcInfo, 3> procInfo(numParallelDims);
-        for (size_t dim = 0; dim < numParallelDims; ++dim) {
-          procInfo[numParallelDims - dim - 1] = {
-              buildFlowWorkgroupInfoOp<IREE::Input::DispatchWorkgroupIDOp>(
-                  builder, dim),
-              buildFlowWorkgroupInfoOp<IREE::Input::DispatchWorkgroupCountOp>(
-                  builder, dim)};
-        }
-        return procInfo;
-      },
-      {linalg::DistributionMethod::Cyclic, linalg::DistributionMethod::Cyclic,
-       linalg::DistributionMethod::Cyclic},
-      DenseMap<StringRef,
-               std::function<linalg::ProcInfo(OpBuilder &, Location)>>()};
-
-  patterns.add<TiledOpInterfaceTilingPattern>(
-      context,
-      linalg::LinalgTilingOptions()
-          .setTileSizes(ArrayRef<int64_t>{10, 0, 30})
-          .setDistributionOptions(workgroupDistributionOptions),
-      linalg::LinalgTransformationFilter(
-          StringAttr::get(context, "distribute_input"),
-          StringAttr::get(context, "distribute_output")));
-
-  patterns.add<TiledOpInterfaceTilingPattern>(
-      context,
-      linalg::LinalgTilingOptions().setTileSizes(ArrayRef<int64_t>{32}),
-      linalg::LinalgTransformationFilter(
-          StringAttr::get(context, "tiling_1d_stage5_fft_input"),
-          StringAttr::get(context, "tiling_1d_stage5_fft_output")));
-
-  patterns.add<TiledOpInterfaceTilingPattern>(
-      context,
-      linalg::LinalgTilingOptions().setTileSizes(ArrayRef<int64_t>{10, 32}),
-      linalg::LinalgTransformationFilter(
-          StringAttr::get(context, "tiling_2d_stage5_fft_input"),
-          StringAttr::get(context, "tiling_2d_stage5_fft_output")));
-
-  patterns.add<TiledOpInterfaceTilingPattern>(
-      context, linalg::LinalgTilingOptions().setTileSizes({0, 20}),
-      linalg::LinalgTransformationFilter(
-          StringAttr::get(context, "tiling_repeated_indices_scatter_input"),
-          StringAttr::get(context, "tiling_repeated_indices_scatter_output")));
-
-  if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) {
-    return signalPassFailure();
+  LogicalResult matchAndRewrite(tensor::ExtractSliceOp sliceOp,
+                                PatternRewriter &rewriter) const override {
+    auto sourceOp = sliceOp.source().getDefiningOp<TilingInterface>();
+    if (!sourceOp || !filter.hasReplacementFilter(sourceOp)) return failure();
+    SmallVector<Operation *> tiledOps = sourceOp.getTiledImplementation(
+        rewriter, sourceOp.getDestinationOperands(rewriter),
+        sliceOp.getMixedOffsets(), sliceOp.getMixedSizes(),
+        /*tileDestOperands=*/true);
+    assert(tiledOps.size() == 1 && "expected single tiled op");
+    Operation *tiledOp = tiledOps.front();
+    rewriter.replaceOp(sliceOp, tiledOp->getResults());
+    return success();
   }
-}
 
-std::unique_ptr<OperationPass<FuncOp>>
-IREE::LinalgExt::createTiledOpInterfaceTilingPass() {
-  return std::make_unique<TiledOpInterfaceTilingPass>();
-}
+ private:
+  linalg::LinalgTilingOptions options;
+  linalg::LinalgTransformationFilter filter;
+};
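+
+// Sketch of a driver for the two patterns above (an assumption mirroring the
+// removed test pass; the tile sizes and filter strings are illustrative):
+//
+//   RewritePatternSet patterns(context);
+//   patterns.add<OpTilingPattern, SliceOpTiledOpSwapPattern>(
+//       context, linalg::LinalgTilingOptions().setTileSizes({10, 20}),
+//       linalg::LinalgTransformationFilter(
+//           StringAttr::get(context, "tiling_input"),
+//           StringAttr::get(context, "tiling_output")));
+//   (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));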
+
+}  // namespace
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/TilingExternalModels.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/TilingExternalModels.cpp
new file mode 100644
index 0000000..7174daa
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/TilingExternalModels.cpp
@@ -0,0 +1,178 @@
+//===- TilingExternalModels.cpp - External models for TilingInterface -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
+#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/Utils/Utils.h"
+#include "mlir/Interfaces/TilingInterface.h"
+
+#define DEBUG_TYPE "linalg-ext-tiling"
+
+using namespace mlir;
+using namespace mlir::linalg;
+using namespace mlir::iree_compiler::IREE::LinalgExt;
+
+static Value getAsValue(OpBuilder &b, Location loc, OpFoldResult ofr) {
+  if (auto v = ofr.dyn_cast<Value>()) return v;
+  return b.create<arith::ConstantIndexOp>(
+      loc, ofr.get<Attribute>().cast<IntegerAttr>().getInt());
+}
+static SmallVector<Value> getAsValues(OpBuilder &b, Location loc,
+                                      ArrayRef<OpFoldResult> ofrs) {
+  SmallVector<Value> vals;
+  vals.reserve(ofrs.size());
+  for (auto ofr : ofrs) vals.push_back(getAsValue(b, loc, ofr));
+  return vals;
+}
+
+static SmallVector<Value, 4> makeTiledInputShapes(OpBuilder &b, Location loc,
+                                                  LinalgOp linalgOp,
+                                                  ArrayRef<Value> valuesToTile,
+                                                  ArrayRef<Value> ivsRef,
+                                                  ArrayRef<Value> tileSizesRef,
+                                                  ArrayRef<Value> sizeBounds) {
+  assert(static_cast<int64_t>(valuesToTile.size()) == linalgOp.getNumInputs() &&
+         "expected one value to tile for every input operand");
+
+  Value zero = b.create<arith::ConstantIndexOp>(loc, 0);
+  SmallVector<Value> tileSizes{tileSizesRef.begin(), tileSizesRef.end()};
+  tileSizes.append(sizeBounds.size() - tileSizes.size(), zero);
+
+  // Construct (potentially temporary) mins and maxes on which to apply maps
+  // that define tile subshapes.
+  SmallVector<Value> lbs = computeTileOffsets(b, loc, ivsRef, tileSizes);
+  SmallVector<Value> subShapeSizes =
+      computeTileSizes(b, loc, ivsRef, tileSizes, sizeBounds);
+
+  SmallVector<Value, 4> tiledShapes;
+  tiledShapes.reserve(valuesToTile.size());
+  for (OpOperand *opOperand : linalgOp.getInputOperands()) {
+    Value shapedOp = valuesToTile[opOperand->getOperandNumber()];
+    LLVM_DEBUG(llvm::dbgs() << "makeTiledInputShapes: for operand " << shapedOp);
+    AffineMap map = linalgOp.getTiedIndexingMap(opOperand);
+    LLVM_DEBUG(llvm::dbgs() << ": tiled: figure out subshape...\n");
+    tiledShapes.push_back(makeTiledShape(b, loc, shapedOp, tileSizes, map, lbs,
+                                         sizeBounds, subShapeSizes));
+  }
+
+  return tiledShapes;
+}
+
+namespace {
+
+/// External model implementation of TilingInterface for LinalgOps. This is
+/// templated on the actual Linalg named op for now since the registration of
+/// the external model requires the original operation.
+template <typename LinalgOpTy>
+struct LinalgOpTilingInterface
+    : public TilingInterface::ExternalModel<LinalgOpTilingInterface<LinalgOpTy>,
+                                            LinalgOpTy> {
+  SmallVector<Value> getDestinationOperands(Operation *op, OpBuilder &b) const {
+    LinalgOp linalgOp = cast<LinalgOp>(op);
+    return linalgOp.getOutputOperands();
+  }
+
+  SmallVector<StringRef> getLoopIteratorTypes(Operation *op) const {
+    LinalgOp linalgOp = cast<LinalgOp>(op);
+    SmallVector<StringRef> iteratorTypes;
+    iteratorTypes.reserve(linalgOp.iterator_types().size());
+    for (Attribute iteratorAttr : linalgOp.iterator_types()) {
+      iteratorTypes.push_back(iteratorAttr.cast<StringAttr>().getValue());
+    }
+    return iteratorTypes;
+  }
+
+  SmallVector<Range> getIterationDomain(Operation *op, OpBuilder &b) const {
+    LinalgOp linalgOp = cast<LinalgOp>(op);
+    return linalgOp.createLoopRanges(b, op->getLoc());
+  }
+
+  SmallVector<Operation *> getTiledImplementation(
+      Operation *op, OpBuilder &b, ValueRange tiledDest,
+      ArrayRef<OpFoldResult> offsets, ArrayRef<OpFoldResult> sizes,
+      bool tileDestOperands) const {
+    LinalgOp linalgOp = cast<LinalgOp>(op);
+    if (op->getNumResults() != 1) {
+      // TODO: Need a failure message here, but `notifyMatchFailure` is only a
+      // method on `PatternRewriter`.
+      return {};
+    }
+    Location loc = op->getLoc();
+    AffineMap shapeSizesToLoopsMap = linalgOp.getShapesToLoopsMap();
+    auto allShapeSizes = linalgOp.createFlatListOfOperandDims(b, loc);
+    if (!shapeSizesToLoopsMap) return {};
+
+    OpOperand *outOperand = linalgOp.getOutputOperand(0);
+    AffineMap indexingMap = linalgOp.getTiedIndexingMap(outOperand);
+    if (!indexingMap.isProjectedPermutation()) return {};
+
+    SmallVector<Value> offsetsVals = getAsValues(b, loc, offsets);
+    SmallVector<Value> sizeVals = getAsValues(b, loc, sizes);
+    SmallVector<Value> sizeBounds =
+        applyMapToValues(b, loc, shapeSizesToLoopsMap, allShapeSizes);
+
+    // The offsets and sizes from the slice operation only give the tile
+    // size of the output. Use those to compute the tile sizes and offsets of
+    // the loops. For loops not used to access the output, set the tile sizes
+    // to the loop bounds and set the offsets to 0.
+    Value zero = b.create<arith::ConstantIndexOp>(loc, 0);
+    SmallVector<Value> tileOffsets(sizeBounds.size(), zero);
+    SmallVector<Value> tileSizes = sizeBounds;
+    for (auto result : enumerate(indexingMap.getResults())) {
+      unsigned position = result.value().cast<AffineDimExpr>().getPosition();
+      tileOffsets[position] = offsetsVals[result.index()];
+      tileSizes[position] = sizeVals[result.index()];
+    }
+
+    SmallVector<Value> valuesToTile = linalgOp.getInputOperands();
+    SmallVector<Value> tiledOperands;
+    if (tileDestOperands) {
+      // Append the outputs then tile both the inputs and outputs.
+      valuesToTile.append(tiledDest.begin(), tiledDest.end());
+      tiledOperands = makeTiledShapes(b, loc, linalgOp, valuesToTile,
+                                      tileOffsets, tileSizes, sizeBounds);
+    } else {
+      // Only tile the inputs, then append the outputs.
+      int64_t dim = offsets.size();
+      ArrayRef<Value> tileOffsetsRef{tileOffsets.begin(), tileOffsets.end()};
+      ArrayRef<Value> tileSizesRef{tileSizes.begin(), tileSizes.end()};
+      tiledOperands = makeTiledInputShapes(
+          b, loc, linalgOp, valuesToTile, tileOffsetsRef.take_front(dim + 1),
+          tileSizesRef.take_front(dim + 1), sizeBounds);
+      tiledOperands.append(tiledDest.begin(), tiledDest.end());
+    }
+    return {linalgOp.clone(b, loc, tiledDest.getTypes(), tiledOperands)};
+  }
+};
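+
+// Worked example for the offset/size expansion in `getTiledImplementation`
+// above (illustrative): for a matmul-like op whose output indexing map is
+// (d0, d1, d2) -> (d0, d1), slice offsets [%o0, %o1] and sizes [%s0, %s1]
+// expand to
+//   tileOffsets = [%o0, %o1, 0], tileSizes = [%s0, %s1, %K]
+// where %K is the loop bound of the untiled reduction dimension d2.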
+}  // namespace
+
+template <typename OpType>
+void registerOne(DialectRegistry &registry) {
+  registry.addOpInterface<OpType, LinalgOpTilingInterface<OpType>>();
+}
+
+/// Variadic helper function.
+template <typename... OpTypes>
+void registerAll(DialectRegistry &registry) {
+  // FIXME: In C++17 this can be simplified by using fold expressions.
+  (void)std::initializer_list<int>{0, (registerOne<OpTypes>(registry), 0)...};
+}
+
+#define GET_OP_LIST
+
+void mlir::iree_compiler::IREE::LinalgExt::
+    registerTilingInterfaceExternalModels(DialectRegistry &registry) {
+  registerOne<linalg::GenericOp>(registry);
+  registerAll<
+#include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
+      >(registry);
+}
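+
+// Illustrative usage (an assumption, not part of this patch): a tool would
+// register the external models on its DialectRegistry before creating the
+// context, e.g.:
+//
+//   DialectRegistry registry;
+//   registry.insert<linalg::LinalgDialect>();
+//   mlir::iree_compiler::IREE::LinalgExt::
+//       registerTilingInterfaceExternalModels(registry);
+//   MLIRContext context(registry);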
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/TilingToTileOp.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/TilingToTileOp.cpp
new file mode 100644
index 0000000..ba8cc4d
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/TilingToTileOp.cpp
@@ -0,0 +1,106 @@
+//===- TilingToTileOp.cpp - Tiling to a TileOp using TilingInterface ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h"
+#include "iree-dialects/Dialect/LinalgExt/Transforms/Transforms.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/Linalg/Utils/Utils.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/OperationSupport.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+using namespace mlir::iree_compiler::IREE::LinalgExt;
+
+struct TilingResult {
+  TileOp tileOp;
+  Operation *tiledOp;
+};
+
+static TilingResult tileToTileOp(PatternRewriter &rewriter, TilingInterface op,
+                                 int64_t tiledDim, Value tileSize) {
+  Location loc = op->getLoc();
+  OpBuilder::InsertionGuard g(rewriter);
+  // TODO: Handle the case where the `loopRanges` are empty.
+  SmallVector<Range> loopRanges = op.getIterationDomain(rewriter);
+  assert(loopRanges.size() >= 1 &&
+         "expected at least a single loop in operation");
+  auto destOperands = op.getDestinationOperands(rewriter);
+  Operation *tiledOp = nullptr;
+  auto tileOp = rewriter.create<TileOp>(
+      loc, tileSize, destOperands, tiledDim,
+      [&](OpBuilder &b, Location loc, Value offset, Value size,
+          ValueRange outSlices) {
+        // TODO: support `getTiledImplementation` with >1 produced tiled ops.
+        int64_t nLoops = loopRanges.size();
+        SmallVector<OpFoldResult> tiledOffsets, tiledSizes;
+        tiledOffsets.reserve(nLoops);
+        tiledSizes.reserve(nLoops);
+        for (int64_t i = 0; i < nLoops; ++i) {
+          if (i == tiledDim) {
+            tiledOffsets.push_back(offset);
+            tiledSizes.push_back(size);
+          } else {
+            tiledOffsets.push_back(loopRanges[i].offset);
+            tiledSizes.push_back(loopRanges[i].size);
+          }
+        }
+        SmallVector<Operation *> tiledOps = op.getTiledImplementation(
+            b, outSlices, tiledOffsets, tiledSizes, /*tileDestOperands=*/false);
+        assert(tiledOps.size() == 1 && "expected single tiled op");
+        tiledOp = tiledOps.front();
+        b.create<TileYieldOp>(loc, tiledOp->getResults());
+      });
+  return TilingResult{tileOp, tiledOp};
+}
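+
+// For intuition, a sketch of the IR produced above for a single-result op
+// tiled along one dimension (shapes illustrative; see the roundtrip tests
+// for a complete example):
+//
+//   %r = iree_linalg_ext.tile %tile_size outs(%dest: tensor<?xf32>)
+//       -> tensor<?xf32> {
+//   ^bb0(%offset: index, %size: index, %out_slice: tensor<?xf32>):
+//     %t = <tiled op> ... outs(%out_slice: ...)
+//     iree_linalg_ext.tile_yield %t: tensor<?xf32>
+//   }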
+
+FailureOr<Operation *> mlir::iree_compiler::IREE::LinalgExt::
+    LinalgExtTilingPattern::returningMatchAndRewrite(
+        TilingInterface op, PatternRewriter &rewriter) const {
+  /// Currently only handles single-result operations.
+  if (op->getNumResults() != 1)
+    return rewriter.notifyMatchFailure(op, "Not a single result");
+
+  // Get rank and tile sizes.
+  // TODO: consider moving these checks to a common place that the TransformOp
+  // verifier can also use.
+  SmallVector<Value> tileSizes =
+      options.tileSizeComputationFunction(rewriter, op);
+  int64_t dim = -1;
+  for (auto en : llvm::enumerate(tileSizes)) {
+    Optional<int64_t> maybeTileSize = getConstantIntValue(en.value());
+    if (maybeTileSize && *maybeTileSize == 0) continue;
+    if (maybeTileSize && *maybeTileSize < 0)
+      return rewriter.notifyMatchFailure(op, "Negative tile size");
+    if (dim >= 0)
+      return rewriter.notifyMatchFailure(op,
+                                         "Could not find a single tiling dim");
+    dim = en.index();
+  }
+  if (dim < 0)
+    return rewriter.notifyMatchFailure(op,
+                                       "Could not find a single tiling dim");
+
+  /// Currently only handles tiling along a parallel iterator type.
+  auto loopIteratorTypes = op.getLoopIteratorTypes();
+  // Scalar operation, nothing to do, so just return.
+  if (loopIteratorTypes.empty())
+    return rewriter.notifyMatchFailure(op, "Scalar op, no tiling possible");
+  ArrayRef<StringRef> loopIteratorTypesRef(loopIteratorTypes);
+  if (loopIteratorTypesRef[dim] != getParallelIteratorTypeName())
+    return rewriter.notifyMatchFailure(op, "Trying to tile a non-parallel dim");
+
+  TilingResult tilingResult = tileToTileOp(rewriter, op, dim, tileSizes[dim]);
+  rewriter.replaceOp(op, tilingResult.tileOp->getResults());
+
+  return tilingResult.tiledOp;
+}
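+
+// Sketch of configuring this pattern (an assumption; the constructor is
+// declared in Transforms.h): a single non-zero tile size selects the tiled
+// dimension, e.g.
+//
+//   RewritePatternSet patterns(ctx);
+//   patterns.add<LinalgExtTilingPattern>(
+//       ctx, linalg::LinalgTilingOptions().setTileSizes({0, 32}));
+//   (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));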
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/Utils.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/Utils.cpp
new file mode 100644
index 0000000..9b250b8
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/Utils.cpp
@@ -0,0 +1,104 @@
+//===- Utils.cpp - LinalgExt transform utils ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "iree-dialects/Dialect/LinalgExt/Transforms/Utils.h"
+
+#include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/OperationSupport.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+using namespace mlir::iree_compiler::IREE::LinalgExt;
+
+void mlir::iree_compiler::IREE::LinalgExt::completeOffsetsSizesAndStrides(
+    OpBuilder &b, Location loc, Value tensor, ArrayRef<Value> leadingOffsets,
+    ArrayRef<Value> leadingSizes, ArrayRef<Value> leadingStrides,
+    SmallVectorImpl<Value> &offsets, SmallVectorImpl<Value> &sizes,
+    SmallVectorImpl<Value> &strides) {
+  assert(leadingOffsets.size() == leadingSizes.size() &&
+         "expected matching lengths");
+  assert(leadingSizes.size() == leadingStrides.size() &&
+         "expected matching lengths");
+
+  auto rankedTensorType = tensor.getType().cast<RankedTensorType>();
+  int64_t tensorRank = rankedTensorType.getRank();
+  int64_t leadingRank = leadingOffsets.size();
+  offsets = SmallVector<Value>(leadingOffsets.begin(), leadingOffsets.end());
+  sizes = SmallVector<Value>(leadingSizes.begin(), leadingSizes.end());
+  strides = SmallVector<Value>(leadingStrides.begin(), leadingStrides.end());
+  if (leadingRank >= tensorRank) return;
+  Value zero = b.create<arith::ConstantIndexOp>(loc, 0);
+  Value one = b.create<arith::ConstantIndexOp>(loc, 1);
+  for (int64_t i = leadingRank, e = tensorRank; i < e; ++i) {
+    offsets.push_back(zero);
+    sizes.push_back(b.createOrFold<tensor::DimOp>(loc, tensor, i));
+    strides.push_back(one);
+  }
+}
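+
+// Worked example (illustrative): for `%t : tensor<4x8x16xf32>` with
+// leadingOffsets = [%o], leadingSizes = [%s], leadingStrides = [%st], the
+// completed triple is
+//   offsets = [%o, 0, 0], sizes = [%s, 8, 16], strides = [%st, 1, 1]
+// with the trailing sizes produced by (foldable) tensor.dim ops.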
+
+/// Create a tensor::ExtractSliceOp by auto-completing the missing trailing
+/// dimensions to always be offset = 0, size = dim, stride = 1.
+Value mlir::iree_compiler::IREE::LinalgExt::
+    createSubsetExtractOpFromLeadingOffsetsSizesAndStrides(
+        OpBuilder &b, Location loc, Value tensor,
+        ArrayRef<Value> leadingOffsets, ArrayRef<Value> leadingSizes,
+        ArrayRef<Value> leadingStrides) {
+  SmallVector<Value> offsets, sizes, strides;
+  completeOffsetsSizesAndStrides(b, loc, tensor, leadingOffsets, leadingSizes,
+                                 leadingStrides, offsets, sizes, strides);
+  return b.createOrFold<tensor::ExtractSliceOp>(loc, tensor, offsets, sizes,
+                                                strides);
+}
+
+/// Create a tensor::InsertSliceOp by auto-completing the missing trailing
+/// dimensions to always be offset = 0, size = dim, stride = 1.
+Value mlir::iree_compiler::IREE::LinalgExt::
+    createSubsetInsertOpFromLeadingOffsetsSizesAndStrides(
+        OpBuilder &b, Location loc, Value tensor, Value dest,
+        ArrayRef<Value> leadingOffsets, ArrayRef<Value> leadingSizes,
+        ArrayRef<Value> leadingStrides) {
+  SmallVector<Value> offsets, sizes, strides;
+  completeOffsetsSizesAndStrides(b, loc, tensor, leadingOffsets, leadingSizes,
+                                 leadingStrides, offsets, sizes, strides);
+  return b.createOrFold<tensor::InsertSliceOp>(loc, tensor, dest, offsets,
+                                               sizes, strides);
+}
+
+/// Create a iree_compiler::IREE::LinalgExt::ParallelInsertSliceOp by
+/// auto-completing the missing trailing dimensions to always be offset = 0,
+/// size = dim, stride = 1.
+Operation *mlir::iree_compiler::IREE::LinalgExt::
+    createParallelInsertSliceOpFromLeadingOffsetsSizesAndStrides(
+        OpBuilder &b, Location loc, Value tensor, Value dest,
+        ArrayRef<Value> leadingOffsets, ArrayRef<Value> leadingSizes,
+        ArrayRef<Value> leadingStrides) {
+  SmallVector<Value> offsets, sizes, strides;
+  completeOffsetsSizesAndStrides(b, loc, tensor, leadingOffsets, leadingSizes,
+                                 leadingStrides, offsets, sizes, strides);
+  return b.createOrFold<iree_compiler::IREE::LinalgExt::ParallelInsertSliceOp>(
+      loc, tensor, dest, offsets, sizes, strides);
+}
+
+/// Insert the `source` tensor into the `dest` tensor by creating the relevant
+/// `subset_insert` op. The details of the `subset_insert` op are retrieved
+/// from the `subset_extract` op so that they form a matching extract/insert
+/// pair.
+Value mlir::iree_compiler::IREE::LinalgExt::createMatchingSubsetInsertOp(
+    OpBuilder &b, Location loc, tensor::ExtractSliceOp subsetExtractOp,
+    Value source, Value dest) {
+  return b.create<tensor::InsertSliceOp>(
+      loc, subsetExtractOp.source().getType(), source, dest,
+      subsetExtractOp.offsets(), subsetExtractOp.sizes(),
+      subsetExtractOp.strides(), subsetExtractOp.static_offsets(),
+      subsetExtractOp.static_sizes(), subsetExtractOp.static_strides());
+}
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/IR/PyDMDialect.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/IR/PyDMDialect.cpp
index 1915da7..82381bc 100644
--- a/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/IR/PyDMDialect.cpp
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/IR/PyDMDialect.cpp
@@ -33,7 +33,10 @@
 using PyBoolType = PYDM::BoolType;
 using PyConstantOp = PYDM::ConstantOp;
 using PyIntegerType = PYDM::IntegerType;
+using PyListType = PYDM::ListType;
 using PyRealType = PYDM::RealType;
+using PyObjectType = PYDM::ObjectType;
+using PyUnionType = PYDM::UnionType;
 
 void IREEPyDMDialect::initialize() {
   addTypes<
@@ -115,6 +118,49 @@
   return emitError() << "unsupported python integer bit width: " << w;
 }
 
+Type PyIntegerType::parse(mlir::AsmParser &parser) {
+  MLIRContext *ctxt = parser.getContext();
+  auto emitError = [&]() -> InFlightDiagnostic {
+    return parser.emitError(parser.getCurrentLocation());
+  };
+  // Weak
+  if (failed(parser.parseOptionalLess())) return get(ctxt);
+  // AP
+  if (succeeded(parser.parseOptionalStar())) {
+    if (failed(parser.parseGreater())) return Type();
+    return get(ctxt, None);
+  }
+
+  // Explicit
+  bool isSigned = failed(parser.parseOptionalKeyword("unsigned"));
+
+  int width;
+  if (failed(parser.parseInteger(width))) return Type();
+  if (failed(parser.parseGreater())) return Type();
+  if (!isSigned) width = -width;
+  return getChecked(emitError, ctxt, width);
+}
+
+void PyIntegerType::print(mlir::AsmPrinter &printer) const {
+  auto w = getImpl()->bitWidth;
+  if (w) {
+    printer << "<";
+    if (*w == 0) {
+      printer << "*";
+    } else if (*w > 0) {
+      printer << *w;
+    } else {
+      printer << "unsigned " << (-*w);
+    }
+    printer << ">";
+  }
+}
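+
+// Illustrative round-trip forms handled by the parser/printer above (only the
+// `<32>` form appears in the tests; the rest follow from the code):
+//   !iree_pydm.integer                // weak
+//   !iree_pydm.integer<*>             // arbitrary precision
+//   !iree_pydm.integer<32>            // explicit signed width
+//   !iree_pydm.integer<unsigned 32>   // explicit unsigned width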
+
 BuiltinTypeCode PYDM::IntegerType::getTypeCode() const {
   return static_cast<BuiltinTypeCode>(
       makeNumericTypeCode(*getNumericCategory(), *getNumericSubTypeCode()));
@@ -170,6 +216,57 @@
 }
 
 // ListType
+void PyListType::print(mlir::AsmPrinter &printer) const {
+  if (getImpl()->uniformElementType ||
+      getImpl()->storageClass != CollectionStorageClass::Boxed) {
+    printer << "<";
+    switch (getImpl()->storageClass) {
+      case CollectionStorageClass::Boxed:
+        printer << "boxed";
+        break;
+      case CollectionStorageClass::Empty:
+        printer << "empty";
+        break;
+      case CollectionStorageClass::Unboxed:
+        printer << "unboxed";
+        break;
+    }
+
+    if (getImpl()->uniformElementType) {
+      printer << ",";
+      printer << getImpl()->uniformElementType;
+    }
+    printer << ">";
+  }
+}
+
+Type PyListType::parse(mlir::AsmParser &parser) {
+  MLIRContext *ctxt = parser.getContext();
+  if (parser.parseOptionalLess())
+    return get(ctxt, CollectionStorageClass::Boxed, nullptr);
+
+  Type t;
+  StringRef storageClassKeyword;
+  if (parser.parseKeyword(&storageClassKeyword)) return Type();
+  // The element type is optional (e.g. `<empty>` prints without one), so
+  // only parse it when a comma is present.
+  if (succeeded(parser.parseOptionalComma())) {
+    if (parser.parseType(t)) return Type();
+  }
+  if (parser.parseGreater()) return Type();
+
+  CollectionStorageClass storageClass;
+  if (storageClassKeyword == "boxed")
+    storageClass = CollectionStorageClass::Boxed;
+  else if (storageClassKeyword == "empty")
+    storageClass = CollectionStorageClass::Empty;
+  else if (storageClassKeyword == "unboxed")
+    storageClass = CollectionStorageClass::Unboxed;
+  else {
+    parser.emitError(parser.getCurrentLocation(),
+                     "expected one of 'boxed', 'empty', 'unboxed'");
+    return Type();
+  }
+  return get(ctxt, storageClass, t);
+}
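+
+// Illustrative forms (following the printer above):
+//   !iree_pydm.list                              // boxed, untyped elements
+//   !iree_pydm.list<empty>
+//   !iree_pydm.list<boxed,!iree_pydm.object>
+//   !iree_pydm.list<unboxed,!iree_pydm.integer<32>>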
+
 StringRef PYDM::ListType::getPythonTypeName() const { return "list"; }
 
 BuiltinTypeCode PYDM::NoneType::getTypeCode() const {
@@ -206,6 +303,26 @@
 StringRef PYDM::NoneType::getPythonTypeName() const { return "None"; }
 
 // ObjectType
+void PyObjectType::print(mlir::AsmPrinter &printer) const {
+  if (getImpl()->primitiveType)
+    printer << "<" << getImpl()->primitiveType << ">";
+}
+
+Type PyObjectType::parse(mlir::AsmParser &parser) {
+  MLIRContext *ctxt = parser.getContext();
+  if (parser.parseOptionalLess()) return get(ctxt, nullptr);
+
+  Type t;
+  if (parser.parseType(t)) return Type();
+  if (parser.parseGreater()) return Type();
+  if (auto primitiveType = t.dyn_cast<PrimitiveType>())
+    return get(ctxt, primitiveType);
+  else {
+    parser.emitError(parser.getNameLoc(), "expected a primitive type");
+    return Type();
+  }
+}
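+
+// Examples (the parameterized form appears in the tests in this change):
+//   !iree_pydm.object                            // generic object
+//   !iree_pydm.object<!iree_pydm.integer<32>>    // object holding a primitive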
+
 BuiltinTypeCode PYDM::ObjectType::getTypeCode() const {
   return BuiltinTypeCode::Object;
 }
@@ -222,6 +339,26 @@
 }
 
 // RealType
+void PyRealType::print(mlir::AsmPrinter &printer) const {
+  auto ft = getImpl()->floatType;
+  if (ft) printer << "<" << ft << ">";
+}
+
+Type PyRealType::parse(mlir::AsmParser &parser) {
+  MLIRContext *ctxt = parser.getContext();
+
+  auto emitError = [&]() -> InFlightDiagnostic {
+    return parser.emitError(parser.getCurrentLocation());
+  };
+  // Weak
+  if (failed(parser.parseOptionalLess())) return get(ctxt);
+  // Explicit
+  FloatType subType;
+  if (failed(parser.parseType(subType))) return Type();
+  if (failed(parser.parseGreater())) return Type();
+  return getChecked(emitError, ctxt, subType);
+}
+
 LogicalResult PYDM::RealType::verify(
     function_ref<InFlightDiagnostic()> emitError, FloatType floatType) {
   if (!floatType) return success();
@@ -295,6 +432,26 @@
 // Union type implementation
 //------------------------------------------------------------------------------
 
+void PyUnionType::print(mlir::AsmPrinter &printer) const {
+  if (getAlternatives().empty()) return;
+  printer << "<";
+  llvm::interleaveComma(getAlternatives(), printer);
+  printer << ">";
+}
+
+Type PyUnionType::parse(mlir::AsmParser &parser) {
+  MLIRContext *ctxt = parser.getContext();
+  if (parser.parseOptionalLess()) return get(ctxt, {});
+
+  SmallVector<::mlir::Type> alternatives;
+
+  do {
+    Type type;
+    if (parser.parseType(type)) return Type();
+    alternatives.push_back(type);
+  } while (succeeded(parser.parseOptionalComma()));
+
+  if (parser.parseGreater()) return Type();
+  return getChecked([&]() { return parser.emitError(parser.getNameLoc()); },
+                    ctxt, alternatives);
+}
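+
+// Example (illustrative, assuming at least one alternative):
+//   !iree_pydm.union<!iree_pydm.integer<32>, !iree_pydm.real>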
+
 LogicalResult PYDM::UnionType::verify(
     llvm::function_ref<InFlightDiagnostic()> emitError,
     ArrayRef<Type> alternatives) {
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/IR/PyDMOps.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/IR/PyDMOps.cpp
index add7abc..2010688 100644
--- a/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/IR/PyDMOps.cpp
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/IR/PyDMOps.cpp
@@ -29,8 +29,6 @@
 using PyCallOp = PYDM::CallOp;
 using PyFuncOp = PYDM::FuncOp;
 
-static LogicalResult verify(Operation *) { return success(); }
-
 //===----------------------------------------------------------------------===//
 // Utilities
 //===----------------------------------------------------------------------===//
@@ -439,14 +437,13 @@
 
 ::llvm::StringRef FunctionalIfOp::getDefaultDialect() { return "iree_pydm"; }
 
-static LogicalResult verify(FunctionalIfOp op) {
-  if (op.getNumResults() != 0 && op.elseRegion().empty())
-    return op.emitOpError("must have an else block if defining values");
+LogicalResult FunctionalIfOp::verify() {
+  if (getNumResults() != 0 && elseRegion().empty())
+    return emitOpError("must have an else block if defining values");
   return success();
 }
 
-static ParseResult parseFunctionalIfOp(OpAsmParser &parser,
-                                       OperationState &result) {
+ParseResult FunctionalIfOp::parse(OpAsmParser &parser, OperationState &result) {
   // Create the regions for 'then'.
   result.regions.reserve(2);
   Region *thenRegion = result.addRegion();
@@ -478,7 +475,8 @@
   return success();
 }
 
-static void print(OpAsmPrinter &p, FunctionalIfOp op) {
+void FunctionalIfOp::print(OpAsmPrinter &p) {
+  FunctionalIfOp op = *this;
   bool printBlockTerminators = false;
 
   p << " " << op.condition();
@@ -546,7 +544,7 @@
   return success();
 }
 
-static ParseResult parseFuncOp(OpAsmParser &parser, OperationState &result) {
+ParseResult PyFuncOp::parse(OpAsmParser &parser, OperationState &result) {
   auto buildFuncType =
       [](Builder &builder, ArrayRef<Type> argTypes, ArrayRef<Type> results,
          function_interface_impl::VariadicFlag,
@@ -556,45 +554,40 @@
       parser, result, /*allowVariadic=*/false, buildFuncType);
 }
 
-static void print(PyFuncOp op, OpAsmPrinter &p) {
-  FunctionType fnType = op.getType();
+void PyFuncOp::print(OpAsmPrinter &p) {
+  FunctionType fnType = getType();
   function_interface_impl::printFunctionOp(
-      p, op, fnType.getInputs(), /*isVariadic=*/false, fnType.getResults());
-}
-
-static LogicalResult verify(PyFuncOp op) {
-  // TODO: Enforce invariants.
-  return success();
+      p, *this, fnType.getInputs(), /*isVariadic=*/false, fnType.getResults());
 }
 
 //===----------------------------------------------------------------------===//
 // MakeListOp
 //===----------------------------------------------------------------------===//
 
-static LogicalResult verify(MakeListOp op) {
-  auto listType = op.list().getType().cast<ListType>();
+LogicalResult MakeListOp::verify() {
+  auto listType = list().getType().cast<ListType>();
   switch (listType.getStorageClass()) {
     case CollectionStorageClass::Boxed:
-      for (auto element : op.elements()) {
+      for (auto element : elements()) {
         if (!element.getType().isa<ObjectType>()) {
-          return op.emitOpError() << "making a list with boxed storage class "
-                                     "must have object elements. Got: "
-                                  << element.getType();
+          return emitOpError() << "making a list with boxed storage class "
+                                  "must have object elements. Got: "
+                               << element.getType();
         }
       }
       break;
     case CollectionStorageClass::Unboxed:
-      for (auto element : op.elements()) {
+      for (auto element : elements()) {
         if (element.getType().isa<ObjectType>()) {
-          return op.emitOpError() << "making a list with unboxed storage class "
-                                     "must not have object elements. Got: "
-                                  << element.getType();
+          return emitOpError() << "making a list with unboxed storage class "
+                                  "must not have object elements. Got: "
+                               << element.getType();
         }
       }
       break;
     case CollectionStorageClass::Empty:
-      if (!op.elements().empty()) {
-        return op.emitOpError()
+      if (!elements().empty()) {
+        return emitOpError()
                << "making a list with empty storage class must have zero "
                   "elements";
       }
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/Transforms/Optimize/LocalPropagateTypes.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/Transforms/Optimize/LocalPropagateTypes.cpp
index 099aba7..0836591 100644
--- a/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/Transforms/Optimize/LocalPropagateTypes.cpp
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/Transforms/Optimize/LocalPropagateTypes.cpp
@@ -187,7 +187,7 @@
   // cache, it is possible to refinements that include type cycles in the CFG.
   void permuteRefinedBlocks(PermutedTypePropagator &propagator) {
     SmallVector<Block *> blocks;
-    for (auto &block : getOperation().getBodyRegion()) {
+    for (auto &block : getOperation().body()) {
       blocks.push_back(&block);
     }
 
diff --git a/integrations/tensorflow/iree-dialects/python/CMakeLists.txt b/integrations/tensorflow/iree-dialects/python/CMakeLists.txt
index c29cd84..724982b 100644
--- a/integrations/tensorflow/iree-dialects/python/CMakeLists.txt
+++ b/integrations/tensorflow/iree-dialects/python/CMakeLists.txt
@@ -26,6 +26,14 @@
 declare_mlir_dialect_python_bindings(
   ADD_TO_PARENT IREEDialectsPythonSources.Dialects
   ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/iree/compiler"
+  TD_FILE dialects/IreeLinalgExtBinding.td
+  SOURCES dialects/iree_linalg_ext.py
+  DIALECT_NAME iree_linalg_ext
+)
+
+declare_mlir_dialect_python_bindings(
+  ADD_TO_PARENT IREEDialectsPythonSources.Dialects
+  ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/iree/compiler"
   TD_FILE dialects/IreePyDmBinding.td
   SOURCES
     dialects/_iree_pydm_ops_ext.py
@@ -63,6 +71,7 @@
   MLIRPythonSources.Core
   MLIRPythonSources.Dialects.builtin
   MLIRPythonSources.Dialects.func
+  MLIRPythonSources.Dialects.cf
   MLIRPythonSources.Passes
   IREEDialectsPythonSources
   IREEDialectsPythonExtensions
diff --git a/integrations/tensorflow/iree-dialects/python/IREEDialectsModule.cpp b/integrations/tensorflow/iree-dialects/python/IREEDialectsModule.cpp
index b3efba8..3647c47 100644
--- a/integrations/tensorflow/iree-dialects/python/IREEDialectsModule.cpp
+++ b/integrations/tensorflow/iree-dialects/python/IREEDialectsModule.cpp
@@ -90,6 +90,21 @@
       py::arg("context") = py::none(), py::arg("load") = true);
 
   //===--------------------------------------------------------------------===//
+  // IREELinalgExt
+  //===--------------------------------------------------------------------===//
+  auto iree_linalg_ext_m = m.def_submodule("iree_linalg_ext");
+  iree_linalg_ext_m.def(
+      "register_dialect",
+      [](MlirContext context, bool load) {
+        MlirDialectHandle handle = mlirGetDialectHandle__iree_linalg_ext__();
+        mlirDialectHandleRegisterDialect(handle, context);
+        if (load) {
+          mlirDialectHandleLoadDialect(handle, context);
+        }
+      },
+      py::arg("context") = py::none(), py::arg("load") = true);
+
+  //===--------------------------------------------------------------------===//
   // IREEPyDMDialect
   //===--------------------------------------------------------------------===//
   auto iree_pydm_m = m.def_submodule("iree_pydm");
diff --git a/integrations/tensorflow/iree-dialects/python/iree/compiler/dialects/IreeLinalgExtBinding.td b/integrations/tensorflow/iree-dialects/python/iree/compiler/dialects/IreeLinalgExtBinding.td
new file mode 100644
index 0000000..da2ceae
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/python/iree/compiler/dialects/IreeLinalgExtBinding.td
@@ -0,0 +1,13 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef PYTHON_BINDINGS_IREE_LINALGEXT_OPS
+#define PYTHON_BINDINGS_IREE_LINALGEXT_OPS
+
+include "mlir/Bindings/Python/Attributes.td"
+include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.td"
+
+#endif // PYTHON_BINDINGS_IREE_LINALGEXT_OPS
diff --git a/integrations/tensorflow/iree-dialects/python/iree/compiler/dialects/iree_linalg_ext.py b/integrations/tensorflow/iree-dialects/python/iree/compiler/dialects/iree_linalg_ext.py
new file mode 100644
index 0000000..01fb430
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/python/iree/compiler/dialects/iree_linalg_ext.py
@@ -0,0 +1,8 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from ._iree_linalg_ext_ops_gen import *
+from .._mlir_libs._ireeDialects.iree_linalg_ext import *
diff --git a/integrations/tensorflow/iree-dialects/test/iree_linalgext/canonicalize.mlir b/integrations/tensorflow/iree-dialects/test/iree_linalgext/canonicalize.mlir
index acb8344..b8434d2 100644
--- a/integrations/tensorflow/iree-dialects/test/iree_linalgext/canonicalize.mlir
+++ b/integrations/tensorflow/iree-dialects/test/iree_linalgext/canonicalize.mlir
@@ -19,3 +19,24 @@
 
   return %1: tensor<3x5xi32>
 }
+
+// CHECK-LABEL: func @canonicalize_insert_slice_indices(
+//  CHECK-SAME:     %[[arg0:.*]]: tensor<?x?xf32>, %[[arg1:.*]]: tensor<?x?xf32>,
+//  CHECK-SAME:     %[[idx:.*]]: index
+func @canonicalize_insert_slice_indices(
+    %arg0 : tensor<?x?xf32>, %arg1: tensor<?x?xf32>,
+    %idx : index) -> tensor<?x?xf32>
+{
+  %cst = arith.constant 4.200000e+01 : f32
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+
+  %2 = iree_linalg_ext.in_parallel %idx  -> (tensor<?x?xf32>) {
+    ^bb0(%arg3: index):  // no predecessors
+      iree_linalg_ext.perform_concurrently {
+        // CHECK: iree_linalg_ext.parallel_insert_slice %[[arg0]] into %arg1[%[[idx]], 0] [1, 5] [1, 1]
+        iree_linalg_ext.parallel_insert_slice %arg0 into %arg1[%idx, %c0] [%c1, 5] [%c1, %c1] : tensor<?x?xf32> into tensor<?x?xf32>
+      }
+  }
+  return %2 : tensor<?x?xf32>
+}
diff --git a/integrations/tensorflow/iree-dialects/test/iree_linalgext/invalid.mlir b/integrations/tensorflow/iree-dialects/test/iree_linalgext/invalid.mlir
index bb6b37d..517e9c2 100644
--- a/integrations/tensorflow/iree-dialects/test/iree_linalgext/invalid.mlir
+++ b/integrations/tensorflow/iree-dialects/test/iree_linalgext/invalid.mlir
@@ -105,18 +105,18 @@
 
 // -----
 
-func @scatter_mixed_tensor_memref(
+func @scatter_output_type_mismatch(
     %update : tensor<?x?xf32>, %indices : tensor<?x1xi32>,
-    %original : tensor<?x?xf32>) -> memref<?x?xf32> {
-  // expected-error @+1 {{expected type of `outs` operand #0 'tensor<?x?xf32>' to be same as result type 'memref<?x?xf32>'}}
+    %original : tensor<?x?xf32>) -> tensor<4x?xf32> {
+  // expected-error @+1 {{expected type of `outs` operand #0 'tensor<?x?xf32>' to be same as result type 'tensor<4x?xf32>'}}
   %0 = iree_linalg_ext.scatter unique_indices(true)
       ins(%update, %indices : tensor<?x?xf32>, tensor<?x1xi32>)
       outs(%original : tensor<?x?xf32>) {
       ^bb0(%arg1: f32, %arg2: f32):
         %1 = arith.addf %arg1, %arg2 : f32
         iree_linalg_ext.yield %1 : f32
-      } -> memref<?x?xf32>
-  return %0 : memref<?x?xf32>
+      } -> tensor<4x?xf32>
+  return %0 : tensor<4x?xf32>
 }
 
 // -----
@@ -403,3 +403,46 @@
          outs(%init : tensor<3x5xi32>) : tensor<3x5xi32>
   return %0 : tensor<3x5xi32>
 }
+
+// -----
+
+func @not_enough_results() -> () {
+  %num_threads = arith.constant 100 : index
+  // expected-error@+1 {{'iree_linalg_ext.in_parallel' op produces 1 results, but its terminator yields 0 values}}
+  %result = iree_linalg_ext.in_parallel %num_threads -> tensor<100xf32> {
+    ^bb0(%thread_idx : index):
+      iree_linalg_ext.perform_concurrently {}
+  }
+}
+
+// -----
+
+func @too_many_results(%1 : tensor<1xf32>, %out : tensor<100xf32>) -> () {
+  %num_threads = arith.constant 100 : index
+  // expected-error@+1 {{'iree_linalg_ext.in_parallel' op produces 1 results, but its terminator yields 2 values}}
+  %result = iree_linalg_ext.in_parallel %num_threads -> tensor<100xf32> {
+    ^bb0(%thread_idx : index):
+      %0 = arith.constant 1 : index
+      iree_linalg_ext.perform_concurrently {
+        iree_linalg_ext.parallel_insert_slice %1 into %out[%thread_idx][%0][%0] :
+          tensor<1xf32> into tensor<100xf32>
+        iree_linalg_ext.parallel_insert_slice %1 into %out[%thread_idx][%0][%0] :
+          tensor<1xf32> into tensor<100xf32>
+      }
+  }
+}
+
+// -----
+
+func @type_mismatch(%1 : tensor<1xf32>, %out : tensor<200xf32>) -> () {
+  %num_threads = arith.constant 100 : index
+  // expected-error@+1 {{'iree_linalg_ext.in_parallel' op type mismatch between 0th result of in_parallel ('tensor<200xf32>') and 0th result yielded by its terminator ('tensor<100xf32>')}}
+  %result = iree_linalg_ext.in_parallel %num_threads -> tensor<100xf32> {
+    ^bb0(%thread_idx : index):
+      %0 = arith.constant 1 : index
+      iree_linalg_ext.perform_concurrently {
+        iree_linalg_ext.parallel_insert_slice %1 into %out[%thread_idx][%0][%0] :
+          tensor<1xf32> into tensor<200xf32>
+      }
+  }
+}
diff --git a/integrations/tensorflow/iree-dialects/test/iree_linalgext/roundtrip.mlir b/integrations/tensorflow/iree-dialects/test/iree_linalgext/roundtrip.mlir
index a4c9fc6..98b2c71 100644
--- a/integrations/tensorflow/iree-dialects/test/iree_linalgext/roundtrip.mlir
+++ b/integrations/tensorflow/iree-dialects/test/iree_linalgext/roundtrip.mlir
@@ -539,3 +539,58 @@
 //  CHECK-SAME:      dimensions(dense<[0, 1]> : tensor<2xi64>)
 //  CHECK-SAME:      ins(%[[ARG0]]
 //  CHECK-SAME:      outs(%[[INIT]]
+
+// -----
+
+// CHECK-LABEL: func @static_tile
+func @static_tile(%chunk_size: index, %in: tensor<?xf32>, %out: tensor<?xf32>, %out2: tensor<?xf32>) -> (tensor<?xf32>) {
+  %c0 = arith.constant 0: index
+  //%d0 = tensor.dim %out, %c0: tensor<?xf32>
+
+  // CHECK: iree_linalg_ext.tile %{{.*}} outs(%{{.*}}: tensor<?xf32>, %{{.*}}: tensor<?xf32>)
+  // CHECK: ^bb0(%{{.*}}: index, %{{.*}}: index, %{{.*}}: tensor<?xf32>, %{{.*}}: tensor<?xf32>):
+  %0:2 = iree_linalg_ext.tile %chunk_size outs(%out: tensor<?xf32>, %out2: tensor<?xf32>)
+      -> (tensor<?xf32>, tensor<?xf32>) {
+    // TODO: one offset and one size per tensor?
+    // If not necessary in the dense strided-array world, what about the rest?
+    ^bb0(%offset: index, %size: index, %st1: tensor<?xf32>, %st2: tensor<?xf32>):
+      // TODO: atm this is just 1-1: out-chunk-size -> in-size.
+      %1 = tensor.extract_slice %in[%offset][%size][1] : tensor<?xf32> to tensor<?xf32>
+      %3 = linalg.generic {
+           indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+           iterator_types = ["parallel"]}
+         ins(%1: tensor<?xf32>) outs(%st1: tensor<?xf32>) {
+         ^bb0(%a: f32, %b:f32):  // no predecessors
+           %f42 = arith.constant 42.0: f32
+           %tmp = arith.mulf %a, %f42: f32
+           linalg.yield %tmp: f32
+      } -> tensor<?xf32>
+      iree_linalg_ext.tile_yield %3, %st2: tensor<?xf32>, tensor<?xf32> // assumes dim is 0 and stacks
+  }
+  return %0#0: tensor<?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @simple_example
+func @simple_example(%in: tensor<100xf32>, %out: tensor<100xf32>) -> (tensor<100xf32>) {
+  %num_threads = arith.constant 100 : index
+  %result = iree_linalg_ext.in_parallel %num_threads -> tensor<100xf32> {
+    ^bb0(%thread_idx : index):
+      %0 = arith.constant 0 : index
+      %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
+      iree_linalg_ext.perform_concurrently {
+        iree_linalg_ext.parallel_insert_slice %1 into %out[%thread_idx][%0][%0] :
+          tensor<1xf32> into tensor<100xf32>
+      }
+  }
+  return %result : tensor<100xf32>
+}
+
+func @no_terminator() -> () {
+  %num_threads = arith.constant 100 : index
+  iree_linalg_ext.in_parallel %num_threads -> () {
+    ^bb0(%thread_idx : index):
+  }
+  return
+}
diff --git a/integrations/tensorflow/iree-dialects/test/iree_pydm/optimize/local_propagate_types.mlir b/integrations/tensorflow/iree-dialects/test/iree_pydm/optimize/local_propagate_types.mlir
index 466b5b5..46678d4 100644
--- a/integrations/tensorflow/iree-dialects/test/iree_pydm/optimize/local_propagate_types.mlir
+++ b/integrations/tensorflow/iree-dialects/test/iree_pydm/optimize/local_propagate_types.mlir
@@ -24,11 +24,11 @@
 // that the cast in the entry block is sunk into copies of ^bb1 and ^bb2, and
 // because of the donotoptimize op, the fully generic path of ^bb2 also must
 // be preserved (as ^bb3).
-// CHECK:   std.cond_br %arg0, ^bb1(%arg1 : !iree_pydm.object<!iree_pydm.integer<32>>), ^bb2(%arg1 : !iree_pydm.object<!iree_pydm.integer<32>>)
+// CHECK:   cf.cond_br %arg0, ^bb1(%arg1 : !iree_pydm.object<!iree_pydm.integer<32>>), ^bb2(%arg1 : !iree_pydm.object<!iree_pydm.integer<32>>)
 // CHECK: ^bb1(%[[BB1_PHI0:.*]]: !iree_pydm.object<!iree_pydm.integer<32>>): // pred: ^bb0
 // CHECK:   %[[BB1_V0:.*]] = static_info_cast %[[BB1_PHI0]] : !iree_pydm.object<!iree_pydm.integer<32>> -> !iree_pydm.object
 // CHECK:   %[[BB1_V1:.*]] = "custom.donotoptimize"(%[[BB1_V0]]) : (!iree_pydm.object) -> !iree_pydm.object
-// CHECK:   std.br ^bb3(%[[BB1_V1]] : !iree_pydm.object)
+// CHECK:   cf.br ^bb3(%[[BB1_V1]] : !iree_pydm.object)
 // CHECK: ^bb2(%[[BB2_PHI0:.*]]: !iree_pydm.object<!iree_pydm.integer<32>>): // pred: ^bb0
 // CHECK:   %[[BB2_V0:.*]] = make_list %[[BB2_PHI0]]
 // CHECK:   return %[[BB2_V0]]
@@ -37,10 +37,10 @@
 // CHECK:    return %[[BB3_V0]]
 iree_pydm.func @sink_static_info_cast_into_branch(%pred : i1, %arg0 : !iree_pydm.object<!iree_pydm.integer<32>>) -> (!iree_pydm.exception_result, !iree_pydm.list) {
   %0 = static_info_cast %arg0 : !iree_pydm.object<!iree_pydm.integer<32>> -> !iree_pydm.object
-  std.cond_br %pred, ^bb1(%0 : !iree_pydm.object), ^bb2(%0 : !iree_pydm.object)
+  cf.cond_br %pred, ^bb1(%0 : !iree_pydm.object), ^bb2(%0 : !iree_pydm.object)
 ^bb1(%phi0 : !iree_pydm.object):
   %1 = "custom.donotoptimize"(%phi0) : (!iree_pydm.object) -> (!iree_pydm.object)
-  std.br ^bb2(%1 : !iree_pydm.object)
+  cf.br ^bb2(%1 : !iree_pydm.object)
 ^bb2(%phi1 : !iree_pydm.object):
   %list = make_list %phi1 : !iree_pydm.object -> !iree_pydm.list
   return %list : !iree_pydm.list
diff --git a/integrations/tensorflow/iree-dialects/test/iree_pydm/optimize/variables_to_ssa.mlir b/integrations/tensorflow/iree-dialects/test/iree_pydm/optimize/variables_to_ssa.mlir
index c8b38fe..90feb7e 100644
--- a/integrations/tensorflow/iree-dialects/test/iree_pydm/optimize/variables_to_ssa.mlir
+++ b/integrations/tensorflow/iree-dialects/test/iree_pydm/optimize/variables_to_ssa.mlir
@@ -59,8 +59,8 @@
   // CHECK-NOT: store_var
   %a = alloc_free_var "a" -> !iree_pydm.free_var_ref
   store_var %a = %arg0 : !iree_pydm.free_var_ref, !iree_pydm.object
-  // CHECK: std.br ^bb1(%arg0 : !iree_pydm.object)
-  std.br ^bb1
+  // CHECK: cf.br ^bb1(%arg0 : !iree_pydm.object)
+  cf.br ^bb1
   // CHECK: ^bb1(%[[PHI:.*]]: !iree_pydm.object): // pred: ^bb0
 ^bb1:
   // CHECK-NOT: load_var
diff --git a/integrations/tensorflow/iree-dialects/test/iree_pydm/to_iree/structural.mlir b/integrations/tensorflow/iree-dialects/test/iree_pydm/to_iree/structural.mlir
index b863f80..04a4f9b 100644
--- a/integrations/tensorflow/iree-dialects/test/iree_pydm/to_iree/structural.mlir
+++ b/integrations/tensorflow/iree-dialects/test/iree_pydm/to_iree/structural.mlir
@@ -1,12 +1,12 @@
 // RUN: iree-dialects-opt -split-input-file -convert-iree-pydm-to-iree %s | FileCheck  --dump-input-filter=all %s
 
 // CHECK-LABEL: @bool_to_pred
-// NOTE: Also tests cond_br conversion.
+// NOTE: Also tests cf.cond_br conversion.
 iree_pydm.func @bool_to_pred(%arg0 : !iree_pydm.bool) -> (!iree_pydm.exception_result, !iree_pydm.none) {
   %0 = bool_to_pred %arg0
   %1 = none
-  // CHECK: cond_br %arg0
-  cond_br %0, ^bb1, ^bb2
+  // CHECK: cf.cond_br %arg0
+  cf.cond_br %0, ^bb1, ^bb2
 ^bb1:
   return %1 : !iree_pydm.none
 ^bb2:
@@ -17,8 +17,8 @@
 // CHECK-LABEL: @br
 iree_pydm.func @br() -> (!iree_pydm.exception_result, !iree_pydm.none) {
   %0 = none
-  // CHECK: br ^bb1({{.*}} : i32)
-  br ^bb1(%0 : !iree_pydm.none)
+  // CHECK: cf.br ^bb1({{.*}} : i32)
+  cf.br ^bb1(%0 : !iree_pydm.none)
   // CHECK: ^bb1(%0: i32):
 ^bb1(%1 : !iree_pydm.none):
   return %1 : !iree_pydm.none
@@ -71,14 +71,14 @@
   // CHECK: %[[NEEDED_TYPE_CODE:.*]] = arith.constant 78 : i32
   // CHECK: %[[TYPE_CODE:.*]] = iree_input.list.get %arg0[%[[c0]]] : !iree_input.list<!iree_input.variant> -> i32
   // CHECK: %[[TYPE_EQ:.*]] = arith.cmpi eq, %[[NEEDED_TYPE_CODE]], %[[TYPE_CODE]] : i32
-  // CHECK: cond_br %[[TYPE_EQ]], ^bb1, ^bb4
+  // CHECK: cf.cond_br %[[TYPE_EQ]], ^bb1, ^bb4
 
   // bb1: On equal
   // CHECK: ^bb1:
   // CHECK: %[[c1:.*]] = arith.constant 1 : index
   // CHECK: %[[c0_i32:.*]] = arith.constant 0 : i32
   // CHECK: %[[CONTENTS:.*]] = iree_input.list.get %arg0[%[[c1]]] : !iree_input.list<!iree_input.variant> -> i32
-  // CHECK: br ^bb2(%[[c0_i32]], %[[CONTENTS]] : i32, i32)
+  // CHECK: cf.br ^bb2(%[[c0_i32]], %[[CONTENTS]] : i32, i32)
 
   // bb2: Check status code (from raise_on_failure)
   // CHECK: ^bb2(%3: i32, %4: i32):  // 2 preds: ^bb1, ^bb4
@@ -90,7 +90,7 @@
   // CHECK: ^bb4:
   // CHECK: %[[VALUE_ERROR_CODE:.*]] = arith.constant -4 : i32
   // CHECK: %[[c0_i32_2:.*]] = arith.constant 0 : i32
-  // CHECK: br ^bb2(%[[VALUE_ERROR_CODE]], %[[c0_i32_2]] : i32, i32)
+  // CHECK: cf.br ^bb2(%[[VALUE_ERROR_CODE]], %[[c0_i32_2]] : i32, i32)
   %status, %primitive = unbox %arg0 : !iree_pydm.object -> !iree_pydm.integer<32>
   raise_on_failure %status : !iree_pydm.exception_result
   return %primitive : !iree_pydm.integer<32>
@@ -101,7 +101,7 @@
 iree_pydm.func @raise_on_failure_object_return(%arg0 : !iree_pydm.exception_result, %arg1: !iree_pydm.object) -> (!iree_pydm.exception_result, !iree_pydm.object) {
   // CHECK: %[[c0_i32:.*]] = arith.constant 0 : i32
   // CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[c0_i32]], %arg0 : i32
-  // CHECK: cond_br %[[CMP]], ^bb1, ^bb2
+  // CHECK: cf.cond_br %[[CMP]], ^bb1, ^bb2
   // bb1: success
   // CHECK: ^bb1:
   // CHECK: %[[c0_i32_0:.*]] = arith.constant 0 : i32
@@ -199,18 +199,18 @@
 // CHECK:           %[[VAL_1:.*]] = arith.constant 2 : index
 // CHECK:           %[[VAL_2:.*]] = iree_input.list.size %[[VAL_0]] : !iree_input.list<!iree_input.variant>
 // CHECK:           %[[VAL_3:.*]] = arith.cmpi eq, %[[VAL_1]], %[[VAL_2]] : index
-// CHECK:           cond_br %[[VAL_3]], ^bb1, ^bb4
+// CHECK:           cf.cond_br %[[VAL_3]], ^bb1, ^bb4
 // CHECK:         ^bb1:
 // CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i32
 // CHECK:           %[[VAL_5:.*]] = arith.constant 0 : index
 // CHECK:           %[[VAL_6:.*]] = iree_input.list.get %[[VAL_0]]{{\[}}%[[VAL_5]]] : !iree_input.list<!iree_input.variant> -> i32
 // CHECK:           %[[VAL_7:.*]] = arith.constant 1 : index
 // CHECK:           %[[VAL_8:.*]] = iree_input.list.get %[[VAL_0]]{{\[}}%[[VAL_7]]] : !iree_input.list<!iree_input.variant> -> i1
-// CHECK:           br ^bb2(%[[VAL_4]], %[[VAL_6]], %[[VAL_8]] : i32, i32, i1)
+// CHECK:           cf.br ^bb2(%[[VAL_4]], %[[VAL_6]], %[[VAL_8]] : i32, i32, i1)
 // CHECK:         ^bb2(%[[VAL_9:.*]]: i32, %[[VAL_10:.*]]: i32, %[[VAL_11:.*]]: i1):
 // CHECK:           %[[VAL_12:.*]] = arith.constant 0 : i32
 // CHECK:           %[[VAL_13:.*]] = arith.cmpi eq, %[[VAL_12]], %[[VAL_9]] : i32
-// CHECK:           cond_br %[[VAL_13]], ^bb3, ^bb5
+// CHECK:           cf.cond_br %[[VAL_13]], ^bb3, ^bb5
 // CHECK:         ^bb3:
 // CHECK:           %[[VAL_14:.*]] = arith.constant 2 : index
 // CHECK:           %[[VAL_15:.*]] = iree_input.list.create %[[VAL_14]] : !iree_input.list<!iree_input.variant>
@@ -225,7 +225,7 @@
 // CHECK:           %[[VAL_19:.*]] = arith.constant -4 : i32
 // CHECK:           %[[VAL_20:.*]] = arith.constant 0 : i32
 // CHECK:           %[[VAL_21:.*]] = arith.constant false
-// CHECK:           br ^bb2(%[[VAL_19]], %[[VAL_20]], %[[VAL_21]] : i32, i32, i1)
+// CHECK:           cf.br ^bb2(%[[VAL_19]], %[[VAL_20]], %[[VAL_21]] : i32, i32, i1)
 // CHECK:         ^bb5:
 // CHECK:           %[[VAL_22:.*]] = iree_input.list.create : !iree_input.list<!iree_input.variant>
 // CHECK:           return %[[VAL_9]], %[[VAL_22]] : i32, !iree_input.list<!iree_input.variant>
@@ -247,23 +247,23 @@
 // CHECK:           %[[VAL_5:.*]] = arith.constant 1 : index
 // CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
 // CHECK:           %[[VAL_7:.*]] = arith.cmpi sle, %[[VAL_1]], %[[VAL_6]] : i32
-// CHECK:           %[[VAL_8:.*]] = select %[[VAL_7]], %[[VAL_4]], %[[VAL_3]] : index
+// CHECK:           %[[VAL_8:.*]] = arith.select %[[VAL_7]], %[[VAL_4]], %[[VAL_3]] : index
 // CHECK:           %[[VAL_9:.*]] = arith.muli %[[VAL_2]], %[[VAL_8]] : index
 // CHECK:           %[[VAL_10:.*]] = iree_input.list.create %[[VAL_8]] : !iree_input.list<!iree_input.variant>
 // CHECK:           iree_input.list.resize %[[VAL_10]], %[[VAL_9]] : !iree_input.list<!iree_input.variant>
-// CHECK:           br ^bb1(%[[VAL_4]] : index)
+// CHECK:           cf.br ^bb1(%[[VAL_4]] : index)
 // CHECK:         ^bb1(%[[VAL_11:.*]]: index):
 // CHECK:           %[[VAL_12:.*]] = arith.cmpi ult, %[[VAL_11]], %[[VAL_9]] : index
-// CHECK:           cond_br %[[VAL_12]], ^bb2(%[[VAL_11]], %[[VAL_4]] : index, index), ^bb4
+// CHECK:           cf.cond_br %[[VAL_12]], ^bb2(%[[VAL_11]], %[[VAL_4]] : index, index), ^bb4
 // CHECK:         ^bb2(%[[VAL_13:.*]]: index, %[[VAL_14:.*]]: index):
 // CHECK:           %[[VAL_15:.*]] = arith.cmpi ult, %[[VAL_14]], %[[VAL_2]] : index
-// CHECK:           cond_br %[[VAL_15]], ^bb3(%[[VAL_13]], %[[VAL_14]] : index, index), ^bb1(%[[VAL_13]] : index)
+// CHECK:           cf.cond_br %[[VAL_15]], ^bb3(%[[VAL_13]], %[[VAL_14]] : index, index), ^bb1(%[[VAL_13]] : index)
 // CHECK:         ^bb3(%[[VAL_16:.*]]: index, %[[VAL_17:.*]]: index):
 // CHECK:           %[[VAL_18:.*]] = iree_input.list.get %[[VAL_0]]{{\[}}%[[VAL_17]]] : !iree_input.list<!iree_input.variant> -> !iree_input.list<!iree_input.variant>
 // CHECK:           iree_input.list.set %[[VAL_10]]{{\[}}%[[VAL_16]]], %[[VAL_18]] : !iree_input.list<!iree_input.variant>, !iree_input.list<!iree_input.variant>
 // CHECK:           %[[VAL_19:.*]] = arith.addi %[[VAL_16]], %[[VAL_5]] : index
 // CHECK:           %[[VAL_20:.*]] = arith.addi %[[VAL_17]], %[[VAL_5]] : index
-// CHECK:           br ^bb2(%[[VAL_19]], %[[VAL_20]] : index, index)
+// CHECK:           cf.br ^bb2(%[[VAL_19]], %[[VAL_20]] : index, index)
 // CHECK:         ^bb4:
 // CHECK:           %[[VAL_21:.*]] = arith.constant 0 : i32
 // CHECK:           return %[[VAL_21]], %[[VAL_10]] : i32, !iree_input.list<!iree_input.variant>
@@ -282,29 +282,29 @@
 // CHECK:           %[[VAL_4:.*]] = arith.index_cast %[[VAL_3]] : index to i32
 // CHECK:           %[[VAL_5:.*]] = arith.cmpi slt, %[[VAL_1]], %[[VAL_2]] : i32
 // CHECK:           %[[VAL_6:.*]] = arith.index_cast %[[VAL_1]] : i32 to index
-// CHECK:           cond_br %[[VAL_5]], ^bb1, ^bb2(%[[VAL_6]] : index)
+// CHECK:           cf.cond_br %[[VAL_5]], ^bb1, ^bb2(%[[VAL_6]] : index)
 // CHECK:         ^bb1:
 // CHECK:           %[[VAL_7:.*]] = arith.addi %[[VAL_1]], %[[VAL_4]] : i32
 // CHECK:           %[[VAL_8:.*]] = arith.index_cast %[[VAL_7]] : i32 to index
-// CHECK:           br ^bb2(%[[VAL_8]] : index)
+// CHECK:           cf.br ^bb2(%[[VAL_8]] : index)
 // CHECK:         ^bb2(%[[VAL_9:.*]]: index):
 // CHECK:           %[[VAL_10:.*]] = arith.cmpi ult, %[[VAL_9]], %[[VAL_3]] : index
-// CHECK:           cond_br %[[VAL_10]], ^bb3(%[[VAL_9]] : index), ^bb6
+// CHECK:           cf.cond_br %[[VAL_10]], ^bb3(%[[VAL_9]] : index), ^bb6
 // CHECK:         ^bb3(%[[VAL_11:.*]]: index):
 // CHECK:           %[[VAL_12:.*]] = arith.constant 0 : i32
 // CHECK:           %[[VAL_13:.*]] = iree_input.list.get %[[VAL_0]]{{\[}}%[[VAL_11]]] : !iree_input.list<!iree_input.variant> -> !iree_input.list<!iree_input.variant>
-// CHECK:           br ^bb4(%[[VAL_12]], %[[VAL_13]] : i32, !iree_input.list<!iree_input.variant>)
+// CHECK:           cf.br ^bb4(%[[VAL_12]], %[[VAL_13]] : i32, !iree_input.list<!iree_input.variant>)
 // CHECK:         ^bb4(%[[VAL_14:.*]]: i32, %[[VAL_15:.*]]: !iree_input.list<!iree_input.variant>):
 // CHECK:           %[[VAL_16:.*]] = arith.constant 0 : i32
 // CHECK:           %[[VAL_17:.*]] = arith.cmpi eq, %[[VAL_16]], %[[VAL_14]] : i32
-// CHECK:           cond_br %[[VAL_17]], ^bb5, ^bb7
+// CHECK:           cf.cond_br %[[VAL_17]], ^bb5, ^bb7
 // CHECK:         ^bb5:
 // CHECK:           %[[VAL_18:.*]] = arith.constant 0 : i32
 // CHECK:           return %[[VAL_18]], %[[VAL_15]] : i32, !iree_input.list<!iree_input.variant>
 // CHECK:         ^bb6:
 // CHECK:           %[[VAL_19:.*]] = arith.constant -7 : i32
 // CHECK:           %[[VAL_20:.*]] = iree_input.list.create : !iree_input.list<!iree_input.variant>
-// CHECK:           br ^bb4(%[[VAL_19]], %[[VAL_20]] : i32, !iree_input.list<!iree_input.variant>)
+// CHECK:           cf.br ^bb4(%[[VAL_19]], %[[VAL_20]] : i32, !iree_input.list<!iree_input.variant>)
 // CHECK:         ^bb7:
 // CHECK:           %[[VAL_21:.*]] = iree_input.list.create : !iree_input.list<!iree_input.variant>
 // CHECK:           return %[[VAL_14]], %[[VAL_21]] : i32, !iree_input.list<!iree_input.variant>
@@ -325,24 +325,24 @@
 // CHECK:           %[[VAL_5:.*]] = arith.index_cast %[[VAL_4]] : index to i32
 // CHECK:           %[[VAL_6:.*]] = arith.cmpi slt, %[[VAL_1]], %[[VAL_3]] : i32
 // CHECK:           %[[VAL_7:.*]] = arith.index_cast %[[VAL_1]] : i32 to index
-// CHECK:           cond_br %[[VAL_6]], ^bb1, ^bb2(%[[VAL_7]] : index)
+// CHECK:           cf.cond_br %[[VAL_6]], ^bb1, ^bb2(%[[VAL_7]] : index)
 // CHECK:         ^bb1:
 // CHECK:           %[[VAL_8:.*]] = arith.addi %[[VAL_1]], %[[VAL_5]] : i32
 // CHECK:           %[[VAL_9:.*]] = arith.index_cast %[[VAL_8]] : i32 to index
-// CHECK:           br ^bb2(%[[VAL_9]] : index)
+// CHECK:           cf.br ^bb2(%[[VAL_9]] : index)
 // CHECK:         ^bb2(%[[VAL_10:.*]]: index):
 // CHECK:           %[[VAL_11:.*]] = arith.cmpi ult, %[[VAL_10]], %[[VAL_4]] : index
-// CHECK:           cond_br %[[VAL_11]], ^bb3(%[[VAL_10]] : index), ^bb5
+// CHECK:           cf.cond_br %[[VAL_11]], ^bb3(%[[VAL_10]] : index), ^bb5
 // CHECK:         ^bb3(%[[VAL_12:.*]]: index):
 // CHECK:           %[[VAL_13:.*]] = arith.constant 0 : i32
 // CHECK:           iree_input.list.set %[[VAL_0]]{{\[}}%[[VAL_12]]], %[[VAL_2]] : !iree_input.list<!iree_input.variant>, !iree_input.list<!iree_input.variant>
-// CHECK:           br ^bb4(%[[VAL_13]] : i32)
+// CHECK:           cf.br ^bb4(%[[VAL_13]] : i32)
 // CHECK:         ^bb4(%[[VAL_14:.*]]: i32):
 // CHECK:           %[[VAL_15:.*]] = arith.constant 0 : i32
 // CHECK:           return %[[VAL_15]], %[[VAL_0]] : i32, !iree_input.list<!iree_input.variant>
 // CHECK:         ^bb5:
 // CHECK:           %[[VAL_16:.*]] = arith.constant -7 : i32
-// CHECK:           br ^bb4(%[[VAL_16]] : i32)
+// CHECK:           cf.br ^bb4(%[[VAL_16]] : i32)
 // CHECK:         }
 iree_pydm.func @assign_subscript_list(%arg0 : !iree_pydm.list, %arg1 : !iree_pydm.integer, %arg2 : !iree_pydm.object) -> (!iree_pydm.exception_result, !iree_pydm.list) {
   assign_subscript %arg0[%arg1] = %arg2 : !iree_pydm.list, !iree_pydm.integer, !iree_pydm.object
diff --git a/integrations/tensorflow/iree-dialects/test/python/iree_pydm/importer/flow_control.py b/integrations/tensorflow/iree-dialects/test/python/iree_pydm/importer/flow_control.py
index afa1eef..eaf2579 100644
--- a/integrations/tensorflow/iree-dialects/test/python/iree_pydm/importer/flow_control.py
+++ b/integrations/tensorflow/iree-dialects/test/python/iree_pydm/importer/flow_control.py
@@ -8,7 +8,7 @@
 # CHECK: %[[COND:.*]] = load_var %cond
 # CHECK: %[[COND_BOOL:.*]] = as_bool %[[COND]]
 # CHECK: %[[COND_PRED:.*]] = bool_to_pred %[[COND_BOOL]]
-# CHECK: cond_br %2, ^bb1, ^bb2
+# CHECK: cf.cond_br %2, ^bb1, ^bb2
 # CHECK: ^bb1:
 # CHECK: %[[A:.*]] = load_var %a
 # CHECK: return %[[A]]
@@ -24,7 +24,7 @@
 
 
 # CHECK-LABEL: @if_fallthrough
-# CHECK: cond_br {{.*}}, ^bb1, ^bb2
+# CHECK: cf.cond_br {{.*}}, ^bb1, ^bb2
 # CHECK: ^bb1:
 # CHECK: br ^bb3
 # CHECK: ^bb2:
@@ -41,7 +41,7 @@
 
 
 # CHECK-LABEL: @if_noelse
-# CHECK: cond_br {{.*}}, ^bb1, ^bb2
+# CHECK: cf.cond_br {{.*}}, ^bb1, ^bb2
 # CHECK: ^bb1:
 # CHECK: br ^bb2
 # CHECK: ^bb2:
@@ -55,11 +55,11 @@
 
 
 # CHECK-LABEL: @if_elif
-# CHECK: cond_br {{.*}}, ^bb1, ^bb2
+# CHECK: cf.cond_br {{.*}}, ^bb1, ^bb2
 # CHECK: ^bb1:
 # CHECK: br ^bb6
 # CHECK: ^bb2:
-# CHECK: cond_br {{.*}}, ^bb3, ^bb4
+# CHECK: cf.cond_br {{.*}}, ^bb3, ^bb4
 # CHECK: ^bb3:
 # CHECK: br ^bb5
 # CHECK: ^bb4:
@@ -80,15 +80,15 @@
 
 
 # CHECK-LABEL: @simple_while
-# CHECK: std.br ^bb1
+# CHECK: cf.br ^bb1
 # CHECK: ^bb1:  // 2 preds: ^bb0, ^bb2
 # CHECK:   %[[COND:.*]] = load_var %cond
 # CHECK:   %[[COND_BOOL:.*]] = as_bool %[[COND]]
 # CHECK:   %[[COND_PRED:.*]] = bool_to_pred %[[COND_BOOL]]
-# CHECL:   std.cond_br %2, ^bb2, ^bb3
+# CHECK:   cf.cond_br %2, ^bb2, ^bb3
 # CHECK: ^bb2:  // pred: ^bb1
 # CHECK:   store_var %a
-# CHECK:   std.br ^bb1
+# CHECK:   cf.br ^bb1
 # CHECK: ^bb3:  // pred: ^bb1
 # CHECK:   load_var %a
 @test_import_global
@@ -102,7 +102,7 @@
 # CHECK: ^bb1:  // 2 preds: ^bb0, ^bb4
 # CHECK: ^bb2:  // pred: ^bb1
 # CHECK: ^bb3:  // pred: ^bb2
-# CHECK:   std.br ^bb5
+# CHECK:   cf.br ^bb5
 # CHECK: ^bb4:  // pred: ^bb2
 # CHECK: ^bb5:  // 2 preds: ^bb1, ^bb3
 # CHECK:   load_var %a
@@ -120,7 +120,7 @@
 # CHECK: ^bb1:  // 3 preds: ^bb0, ^bb3, ^bb4
 # CHECK: ^bb2:  // pred: ^bb1
 # CHECK: ^bb3:  // pred: ^bb2
-# CHECK:   std.br ^bb1
+# CHECK:   cf.br ^bb1
 # CHECK: ^bb4:  // pred: ^bb2
 # CHECK: ^bb5:  // pred: ^bb1
 # CHECK:   load_var %a
@@ -138,7 +138,7 @@
 # CHECK: ^bb1:  // 2 preds: ^bb0, ^bb4
 # CHECK: ^bb2:  // pred: ^bb1
 # CHECK: ^bb3:  // pred: ^bb2
-# CHECK:   std.br ^bb6
+# CHECK:   cf.br ^bb6
 # CHECK: ^bb4:  // pred: ^bb2
 # CHECK: ^bb5:  // pred: ^bb1
 # CHECK:   store_var %c
diff --git a/integrations/tensorflow/iree-dialects/test/python/smoketest.py b/integrations/tensorflow/iree-dialects/test/python/smoketest.py
index cef2e6c..6804fec 100644
--- a/integrations/tensorflow/iree-dialects/test/python/smoketest.py
+++ b/integrations/tensorflow/iree-dialects/test/python/smoketest.py
@@ -2,10 +2,12 @@
 
 import iree.compiler.ir
 from iree.compiler.dialects import iree_input as iree_d
+from iree.compiler.dialects import iree_linalg_ext
 from iree.compiler.dialects import iree_pydm as pydm_d
 
 with iree.compiler.ir.Context() as ctx:
   iree_d.register_dialect()
+  iree_linalg_ext.register_dialect()
   pydm_d.register_dialect()
 
   # iree_pydm types.
diff --git a/integrations/tensorflow/iree-dialects/tools/iree-dialects-opt/CMakeLists.txt b/integrations/tensorflow/iree-dialects/tools/iree-dialects-opt/CMakeLists.txt
index 473ad48..6aecef3 100644
--- a/integrations/tensorflow/iree-dialects/tools/iree-dialects-opt/CMakeLists.txt
+++ b/integrations/tensorflow/iree-dialects/tools/iree-dialects-opt/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(LIBS
   MLIRArithmetic
+  MLIRControlFlow
   MLIRDialect
   MLIRLinalg
   MLIRMemRef
@@ -12,6 +13,7 @@
   IREEInputDialect
   IREELinalgExtDialect
   IREELinalgExtPasses
+  IREELinalgExtTransforms
   IREEPyDMDialect
   IREEPyDMPasses
 )
diff --git a/integrations/tensorflow/iree-dialects/tools/iree-dialects-opt/iree-dialects-opt.cpp b/integrations/tensorflow/iree-dialects/tools/iree-dialects-opt/iree-dialects-opt.cpp
index 81f513b..d1e844b 100644
--- a/integrations/tensorflow/iree-dialects/tools/iree-dialects-opt/iree-dialects-opt.cpp
+++ b/integrations/tensorflow/iree-dialects/tools/iree-dialects-opt/iree-dialects-opt.cpp
@@ -7,10 +7,11 @@
 #include "iree-dialects/Dialect/Input/InputDialect.h"
 #include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtDialect.h"
 #include "iree-dialects/Dialect/LinalgExt/IR/TiledOpInterface.h"
-#include "iree-dialects/Dialect/LinalgExt/Transforms/Passes.h"
+#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h"
 #include "iree-dialects/Dialect/PyDM/IR/PyDMDialect.h"
 #include "iree-dialects/Dialect/PyDM/Transforms/Passes.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -43,9 +44,10 @@
       mlir::iree_compiler::IREE::LinalgExt::IREELinalgExtDialect,
       mlir::iree_compiler::IREE::PYDM::IREEPyDMDialect,
       // Upstream dialects
-      mlir::arith::ArithmeticDialect, mlir::linalg::LinalgDialect,
-      mlir::memref::MemRefDialect, mlir::func::FuncDialect,
-      mlir::scf::SCFDialect, mlir::tensor::TensorDialect>();
+      mlir::arith::ArithmeticDialect, mlir::cf::ControlFlowDialect,
+      mlir::linalg::LinalgDialect, mlir::memref::MemRefDialect,
+      mlir::func::FuncDialect, mlir::scf::SCFDialect,
+      mlir::tensor::TensorDialect>();
 
   IREE::LinalgExt::registerTiledOpInterfaceExternalModels(registry);
 
diff --git a/iree/compiler/Codegen/Common/BUILD b/iree/compiler/Codegen/Common/BUILD
index 80649f3..3851b8b 100644
--- a/iree/compiler/Codegen/Common/BUILD
+++ b/iree/compiler/Codegen/Common/BUILD
@@ -74,7 +74,6 @@
         "//llvm-external-projects/iree-dialects:IREELinalgExtPasses",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:Affine",
-        "@llvm-project//mlir:AffineBufferizableOpInterfaceImpl",
         "@llvm-project//mlir:AffineUtils",
         "@llvm-project//mlir:Analysis",
         "@llvm-project//mlir:ArithmeticDialect",
diff --git a/iree/compiler/Codegen/Common/CMakeLists.txt b/iree/compiler/Codegen/Common/CMakeLists.txt
index 90fd4ef..f53a9da 100644
--- a/iree/compiler/Codegen/Common/CMakeLists.txt
+++ b/iree/compiler/Codegen/Common/CMakeLists.txt
@@ -51,7 +51,6 @@
     IREELinalgExtPasses
     LLVMSupport
     MLIRAffine
-    MLIRAffineBufferizableOpInterfaceImpl
     MLIRAffineUtils
     MLIRAnalysis
     MLIRArithmetic
diff --git a/iree/compiler/Codegen/Common/FoldTensorExtractOp.td b/iree/compiler/Codegen/Common/FoldTensorExtractOp.td
index 98e86f6..fba92ac 100644
--- a/iree/compiler/Codegen/Common/FoldTensorExtractOp.td
+++ b/iree/compiler/Codegen/Common/FoldTensorExtractOp.td
@@ -10,6 +10,7 @@
 include "mlir/Dialect/Bufferization/IR/BufferizationOps.td"
 include "mlir/Dialect/MemRef/IR/MemRefOps.td"
 include "mlir/Dialect/Tensor/IR/TensorOps.td"
+include "mlir/IR/PatternBase.td"
 
 // Canonicalize unnecessary tensor_load when the load is used just for
 // an extract
diff --git a/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp b/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp
index aa2c047..6858248 100644
--- a/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp
+++ b/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp
@@ -54,8 +54,8 @@
 
 #define DEBUG_TYPE "iree-codegen-linalg-bufferize"
 
-using mlir::bufferization::AnalysisBufferizationOptions;
 using mlir::bufferization::BufferizationOptions;
+using mlir::bufferization::OneShotBufferizationOptions;
 
 namespace mlir {
 namespace iree_compiler {
@@ -97,7 +97,7 @@
 /// Run comprehensive bufferize.
 void IREEComprehensiveBufferizePass::runOnOperation() {
   ModuleOp moduleOp = getOperation();
-  AnalysisBufferizationOptions options;
+  OneShotBufferizationOptions options;
   options.allocationFn = allocationFn;
   options.deallocationFn = deallocationFn;
   options.memCpyFn = memCpyFn;
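
For readers tracking the upstream rename: AnalysisBufferizationOptions is now OneShotBufferizationOptions, with the same callback fields. A minimal sketch of the updated wiring, assuming hypothetical myAllocationFn/myDeallocationFn/myMemCpyFn callbacks already defined with the signatures the options struct expects:

    // Sketch only, not part of this patch; the three callbacks are
    // hypothetical placeholders.
    #include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"

    using mlir::bufferization::OneShotBufferizationOptions;

    static OneShotBufferizationOptions makeBufferizationOptions() {
      OneShotBufferizationOptions options;
      options.allocationFn = myAllocationFn;      // field names are unchanged;
      options.deallocationFn = myDeallocationFn;  // only the class was renamed
      options.memCpyFn = myMemCpyFn;
      return options;
    }
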
diff --git a/iree/compiler/Codegen/Common/test/convert_to_destination_passing_style.mlir b/iree/compiler/Codegen/Common/test/convert_to_destination_passing_style.mlir
index 0907d4c..19cafde 100644
--- a/iree/compiler/Codegen/Common/test/convert_to_destination_passing_style.mlir
+++ b/iree/compiler/Codegen/Common/test/convert_to_destination_passing_style.mlir
@@ -466,10 +466,9 @@
 //   CHECK-DAG:   %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(2)
 //   CHECK-DAG:   %[[IN_VIEW:.+]] = flow.dispatch.tensor.load %[[IN]]
 //  CHECK-DAG:    %[[OUT_VIEW:.+]] = flow.dispatch.tensor.load %[[OUT]]
-//   CHECK-DAG:   %[[INIT:.+]] = linalg.init_tensor
 //       CHECK:   linalg.generic
-//  CHECK-SAME:     ins(%[[IN_VIEW]], %[[INIT]]
-//  CHECK-SAME:     outs(%[[OUT_VIEW]]
+//  CHECK-SAME:     ins(%[[IN_VIEW]] :
+//  CHECK-SAME:     outs(%[[OUT_VIEW]] :
 
 // -----
 
diff --git a/iree/compiler/Codegen/Interfaces/BUILD b/iree/compiler/Codegen/Interfaces/BUILD
index ef69d02..3d168b6 100644
--- a/iree/compiler/Codegen/Interfaces/BUILD
+++ b/iree/compiler/Codegen/Interfaces/BUILD
@@ -55,7 +55,6 @@
     deps = [
         "//iree/compiler/Dialect/Flow/IR",
         "//iree/compiler/Dialect/HAL/IR",
-        "@llvm-project//mlir:AffineBufferizableOpInterfaceImpl",
         "@llvm-project//mlir:ArithmeticTransforms",
         "@llvm-project//mlir:BufferizationDialect",
         "@llvm-project//mlir:BufferizationTransforms",
diff --git a/iree/compiler/Codegen/Interfaces/BufferizationInterfaces.cpp b/iree/compiler/Codegen/Interfaces/BufferizationInterfaces.cpp
index e50ff3e..9302dd1 100644
--- a/iree/compiler/Codegen/Interfaces/BufferizationInterfaces.cpp
+++ b/iree/compiler/Codegen/Interfaces/BufferizationInterfaces.cpp
@@ -14,7 +14,6 @@
 #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
-#include "mlir/Dialect/Linalg/ComprehensiveBufferize/AffineInterfaceImpl.h"
 #include "mlir/Dialect/Linalg/ComprehensiveBufferize/ModuleBufferization.h"
 #include "mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -23,13 +22,13 @@
 #include "mlir/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.h"
 #include "mlir/Support/LLVM.h"
 
-using mlir::bufferization::AnalysisBufferizationOptions;
-using mlir::bufferization::AnalysisBufferizationState;
+using mlir::bufferization::AnalysisState;
 using mlir::bufferization::BufferizableOpInterface;
 using mlir::bufferization::BufferizationAliasInfo;
 using mlir::bufferization::BufferizationState;
 using mlir::bufferization::createMemCpy;
-using mlir::bufferization::DialectBufferizationState;
+using mlir::bufferization::DialectAnalysisState;
+using mlir::bufferization::OneShotBufferizationOptions;
 using mlir::bufferization::PostAnalysisStepFn;
 using mlir::bufferization::replaceOpWithNewBufferizedOp;
 using mlir::linalg::eliminateInitTensors;
@@ -43,7 +42,7 @@
 
 namespace {
 /// Flow dialect-specific bufferization state.
-struct FlowBufferizationState : public DialectBufferizationState {
+struct FlowBufferizationState : public DialectAnalysisState {
   DenseMap<Value, Value> subspan_to_buffer;
 
   /// DispatchTensorStoreOps that do not require a copy.
@@ -53,17 +52,14 @@
 
 /// Get FlowBufferizationState.
 static const FlowBufferizationState &getFlowBufferizationState(
-    const BufferizationState &state) {
+    const AnalysisState &state) {
   Optional<const FlowBufferizationState *> maybeState =
       state.getDialectState<FlowBufferizationState>(
           IREE::Flow::FlowDialect::getDialectNamespace());
   assert(maybeState.hasValue() && "FlowBufferizationState does not exist");
   return **maybeState;
 }
-
-/// Get or create FlowBufferizationState.
-static FlowBufferizationState &getFlowBufferizationState(
-    BufferizationState &state) {
+static FlowBufferizationState &getFlowBufferizationState(AnalysisState &state) {
   return state.getOrCreateDialectState<FlowBufferizationState>(
       IREE::Flow::FlowDialect::getDialectNamespace());
 }
@@ -77,7 +73,7 @@
 }
 
 static Value getSubspanBuffer(Value tensor, RewriterBase &rewriter,
-                              const BufferizationState &state) {
+                              const AnalysisState &state) {
   const FlowBufferizationState &flowState = getFlowBufferizationState(state);
   auto it = flowState.subspan_to_buffer.find(tensor);
   assert(it != flowState.subspan_to_buffer.end() && "subspan buffer not found");
@@ -94,7 +90,7 @@
     : public BufferizableOpInterface::ExternalModel<
           DispatchTensorLoadOpInterface, IREE::Flow::DispatchTensorLoadOp> {
   bool isWritable(Operation *op, Value value,
-                  const BufferizationState &state) const {
+                  const AnalysisState &state) const {
     auto loadOp = cast<IREE::Flow::DispatchTensorLoadOp>(op);
     auto shapedType =
         loadOp.source().getType().dyn_cast<IREE::Flow::DispatchTensorType>();
@@ -103,9 +99,10 @@
   }
 
   LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
-                          const BufferizationState &state) const {
+                          BufferizationState &state) const {
     auto loadOp = cast<IREE::Flow::DispatchTensorLoadOp>(op);
-    Value source = getSubspanBuffer(loadOp.source(), rewriter, state);
+    Value source =
+        getSubspanBuffer(loadOp.source(), rewriter, state.getAnalysisState());
 
     // Bufferize to subview.
     replaceOpWithNewBufferizedOp<memref::SubViewOp>(
@@ -120,26 +117,26 @@
     : public BufferizableOpInterface::ExternalModel<
           DispatchTensorStoreOpInterface, IREE::Flow::DispatchTensorStoreOp> {
   bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
-                              const BufferizationState &state) const {
+                              const AnalysisState &state) const {
     return true;
   }
 
   bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
-                               const BufferizationState &state) const {
+                               const AnalysisState &state) const {
     return false;
   }
 
-  SmallVector<OpResult> getAliasingOpResult(
-      Operation *op, OpOperand &opOperand,
-      const BufferizationState &state) const {
+  SmallVector<OpResult> getAliasingOpResult(Operation *op, OpOperand &opOperand,
+                                            const AnalysisState &state) const {
     return {};
   }
 
   LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
-                          const BufferizationState &state) const {
+                          BufferizationState &state) const {
     auto storeOp = cast<IREE::Flow::DispatchTensorStoreOp>(op);
 
-    Value target = getSubspanBuffer(storeOp.target(), rewriter, state);
+    const AnalysisState &analysisState = state.getAnalysisState();
+    Value target = getSubspanBuffer(storeOp.target(), rewriter, analysisState);
     Value subView = rewriter.create<memref::SubViewOp>(
         storeOp->getLoc(), target, storeOp.getMixedOffsets(),
         storeOp.getMixedSizes(), storeOp.getMixedStrides());
@@ -183,9 +180,9 @@
 }
 
 static LogicalResult inplaceTensorStoreOpAnalysis(
-    Operation *op, BufferizationState &state, BufferizationAliasInfo &aliasInfo,
+    Operation *op, AnalysisState &state, BufferizationAliasInfo &aliasInfo,
     SmallVector<Operation *> &newOps) {
-  auto &flowState = getFlowBufferizationState(state);
+  FlowBufferizationState &flowState = getFlowBufferizationState(state);
   op->walk([&](IREE::Flow::DispatchTensorStoreOp storeOp) {
     // If a store op's dest is equivalent to a load op's source, no copy is
     // needed for the store op. All writes already happened inplace.
@@ -203,7 +200,7 @@
 /// * All ops along the reverse SSA use-def chain from the
 ///   DispatchTensorStoreOp to the InitTensorOp must have bufferized in-place.
 static LogicalResult storeTensorOpAnchoredInitTensorEliminationStep(
-    Operation *op, BufferizationState &state, BufferizationAliasInfo &aliasInfo,
+    Operation *op, AnalysisState &state, BufferizationAliasInfo &aliasInfo,
     SmallVector<Operation *> &newOps) {
   return eliminateInitTensors(
       op, state, aliasInfo,
@@ -224,8 +221,7 @@
       newOps);
 }
 
-static LogicalResult createSubSpanBuffers(Operation *op,
-                                          BufferizationState &state,
+static LogicalResult createSubSpanBuffers(Operation *op, AnalysisState &state,
                                           BufferizationAliasInfo &aliasInfo,
                                           SmallVector<Operation *> &newOps) {
   FlowBufferizationState &flowState = getFlowBufferizationState(state);
@@ -274,8 +270,6 @@
 }
 
 void registerBufferizationInterfaces(DialectRegistry &registry) {
-  linalg::comprehensive_bufferize::affine_ext::
-      registerBufferizableOpInterfaceExternalModels(registry);
   arith::registerBufferizableOpInterfaceExternalModels(registry);
   linalg::registerBufferizableOpInterfaceExternalModels(registry);
   scf::registerBufferizableOpInterfaceExternalModels(registry);
@@ -291,7 +285,7 @@
                           DispatchTensorStoreOpInterface>();
 }
 
-void addPostAnalysisTransformations(AnalysisBufferizationOptions &options) {
+void addPostAnalysisTransformations(OneShotBufferizationOptions &options) {
   options.addPostAnalysisStep(createSubSpanBuffers);
   options.addPostAnalysisStep(storeTensorOpAnchoredInitTensorEliminationStep);
   options.addPostAnalysisStep(inplaceTensorStoreOpAnalysis);
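
The same state split drives every signature change in this file: analysis-time queries now take a read-only bufferization::AnalysisState, while the rewriting bufferize hook keeps a mutable BufferizationState and reaches analysis results through getAnalysisState(). A minimal sketch of the post-change external-model shape, using a hypothetical MyDialectOp standing in for the Flow ops above:

    // Sketch only; MyDialectOp is hypothetical. Assumes the usual mlir
    // namespace is in scope.
    using namespace mlir;

    struct MyOpBufferizationModel
        : public bufferization::BufferizableOpInterface::ExternalModel<
              MyOpBufferizationModel, MyDialectOp> {
      // Analysis queries are const and take the new AnalysisState.
      bool bufferizesToMemoryRead(
          Operation *op, OpOperand &opOperand,
          const bufferization::AnalysisState &state) const {
        return true;
      }

      // The rewrite hook takes the mutable BufferizationState and can
      // still consult the analysis via getAnalysisState().
      LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                              bufferization::BufferizationState &state) const {
        const bufferization::AnalysisState &analysisState =
            state.getAnalysisState();
        (void)analysisState;  // e.g. look up dialect-specific state here
        return success();
      }
    };
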
diff --git a/iree/compiler/Codegen/Interfaces/BufferizationInterfaces.h b/iree/compiler/Codegen/Interfaces/BufferizationInterfaces.h
index f5d8b2d..c65a1d7 100644
--- a/iree/compiler/Codegen/Interfaces/BufferizationInterfaces.h
+++ b/iree/compiler/Codegen/Interfaces/BufferizationInterfaces.h
@@ -19,7 +19,7 @@
 
 // Method to add all the analysis passes for bufferization.
 void addPostAnalysisTransformations(
-    bufferization::AnalysisBufferizationOptions &options);
+    bufferization::OneShotBufferizationOptions &options);
 
 }  // namespace iree_compiler
 }  // namespace mlir
diff --git a/iree/compiler/Codegen/Interfaces/CMakeLists.txt b/iree/compiler/Codegen/Interfaces/CMakeLists.txt
index ea969f2..fe86298 100644
--- a/iree/compiler/Codegen/Interfaces/CMakeLists.txt
+++ b/iree/compiler/Codegen/Interfaces/CMakeLists.txt
@@ -31,7 +31,6 @@
   SRCS
     "BufferizationInterfaces.cpp"
   DEPS
-    MLIRAffineBufferizableOpInterfaceImpl
     MLIRArithmeticTransforms
     MLIRBufferization
     MLIRBufferizationTransforms
diff --git a/iree/compiler/Dialect/Flow/IR/FlowBase.td b/iree/compiler/Dialect/Flow/IR/FlowBase.td
index b9da82c..120b77d 100644
--- a/iree/compiler/Dialect/Flow/IR/FlowBase.td
+++ b/iree/compiler/Dialect/Flow/IR/FlowBase.td
@@ -9,6 +9,7 @@
 
 include "iree/compiler/Dialect/Flow/IR/FlowInterfaces.td"
 include "iree/compiler/Dialect/Util/IR/UtilBase.td"
+include "mlir/IR/AttrTypeBase.td"
 
 //===----------------------------------------------------------------------===//
 // IREE execution flow dialect
diff --git a/iree/compiler/Dialect/Flow/IR/FlowOps.cpp b/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
index aa6b96a..be79e19 100644
--- a/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
+++ b/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
@@ -64,8 +64,9 @@
 // flow.dispatch.tie_shape
 //===----------------------------------------------------------------------===//
 
-static LogicalResult verifyDispatchTieShapeOp(DispatchTieShapeOp op) {
-  if (failed(verifyOpDynamicDims(op, {op.operand()}, op.dynamic_dims()))) {
+LogicalResult DispatchTieShapeOp::verify() {
+  if (failed(
+          verifyOpDynamicDims(getOperation(), {operand()}, dynamic_dims()))) {
     return failure();
   }
   return success();
@@ -91,8 +92,8 @@
 // flow.dispatch.tensor.load
 //===----------------------------------------------------------------------===//
 
-static LogicalResult verifyDispatchTensorLoadOp(DispatchTensorLoadOp op) {
-  if (failed(verifyOpDynamicDims(op, {op.source()}, op.source_dims()))) {
+LogicalResult DispatchTensorLoadOp::verify() {
+  if (failed(verifyOpDynamicDims(getOperation(), {source()}, source_dims()))) {
     return failure();
   }
   return success();
@@ -271,8 +272,8 @@
 // flow.dispatch.tensor.store
 //===----------------------------------------------------------------------===//
 
-static LogicalResult verifyDispatchTensorStoreOp(DispatchTensorStoreOp op) {
-  if (failed(verifyOpDynamicDims(op, {op.target()}, op.target_dims()))) {
+LogicalResult DispatchTensorStoreOp::verify() {
+  if (failed(verifyOpDynamicDims(getOperation(), {target()}, target_dims()))) {
     return failure();
   }
   return success();
@@ -425,30 +426,31 @@
                 /*printBlockTerminators=*/true);
 }
 
-static LogicalResult verifyDispatchWorkgroupsOp(DispatchWorkgroupsOp op) {
-  if (op.workgroup_count().empty()) {
-    return op.emitOpError() << "at least one workgroup dimension is required";
+LogicalResult DispatchWorkgroupsOp::verify() {
+  Operation *op = getOperation();
+  if (workgroup_count().empty()) {
+    return op->emitOpError() << "at least one workgroup dimension is required";
   }
 
-  if (failed(verifyOpDynamicDims(op, op.operands(), op.operand_dims())) ||
-      failed(verifyOpDynamicDims(op, op.results(), op.result_dims()))) {
+  if (failed(verifyOpDynamicDims(getOperation(), operands(), operand_dims())) ||
+      failed(verifyOpDynamicDims(getOperation(), results(), result_dims()))) {
     return failure();
   }
 
   auto verifyIOType = [&](Type type) -> LogicalResult {
     if (auto shapedType = type.dyn_cast<ShapedType>()) {
       if (shapedType.getElementType().isIndex()) {
-        return op.emitOpError() << "I/O type " << type
-                                << " is invalid: index types must not cross "
-                                   "the dispatch boundary";
+        return op->emitOpError() << "I/O type " << type
+                                 << " is invalid: index types must not cross "
+                                    "the dispatch boundary";
       }
     }
     return success();
   };
-  for (auto type : op.getOperandTypes()) {
+  for (auto type : getOperandTypes()) {
     if (failed(verifyIOType(type))) return failure();
   }
-  for (auto type : op.getResultTypes()) {
+  for (auto type : getResultTypes()) {
     if (failed(verifyIOType(type))) return failure();
   }
 
@@ -644,15 +646,13 @@
                                               result(), setNameFn);
 }
 
-template <typename T>
-static LogicalResult verifyDispatchWorkgroupInfoOp(T op) {
+LogicalResult verifyDispatchWorkgroupInfoOp(Operation *op, uint64_t dimension) {
   size_t dimCount = 0;
-  if (auto dispatchOp = op->template getParentOfType<DispatchWorkgroupsOp>()) {
+  if (auto dispatchOp = op->getParentOfType<DispatchWorkgroupsOp>()) {
     dimCount = dispatchOp.workgroup_count().size();
   }
-  uint64_t dimension = op.dimension().getZExtValue();
   if (dimCount != 0 && (dimension < 0 || dimension >= dimCount)) {
-    return op.emitOpError()
+    return op->emitOpError()
            << "dimension " << dimension
            << " out of bounds of dispatch dimensions; expected [0, "
            << (dimCount - 1) << ")";
@@ -671,7 +671,7 @@
                      builder.getStringAttr(name));
 }
 
-static LogicalResult verifyExecutableOp(ExecutableOp op) {
+LogicalResult ExecutableOp::verify() {
   // TODO(benvanik): check export name conflicts.
   return success();
 }
@@ -778,12 +778,13 @@
   return FunctionType::get(getContext(), argTypes, getResultTypes());
 }
 
-static LogicalResult verifyDispatchOp(DispatchOp op) {
-  if (op.workgroup_count().empty()) {
-    return op.emitOpError() << "at least one workgroup dimension is required";
+LogicalResult DispatchOp::verify() {
+  Operation *op = getOperation();
+  if (workgroup_count().empty()) {
+    return op->emitOpError() << "at least one workgroup dimension is required";
   }
-  if (failed(verifyOpDynamicDims(op, op.operands(), op.operand_dims())) ||
-      failed(verifyOpDynamicDims(op, op.results(), op.result_dims()))) {
+  if (failed(verifyOpDynamicDims(op, operands(), operand_dims())) ||
+      failed(verifyOpDynamicDims(op, results(), result_dims()))) {
     return failure();
   }
   return success();
@@ -797,8 +798,9 @@
 // flow.tensor.tie_shape
 //===----------------------------------------------------------------------===//
 
-static LogicalResult verifyTensorTieShapeOp(TensorTieShapeOp op) {
-  if (failed(verifyOpDynamicDims(op, {op.operand()}, op.dynamic_dims()))) {
+LogicalResult TensorTieShapeOp::verify() {
+  if (failed(
+          verifyOpDynamicDims(getOperation(), {operand()}, dynamic_dims()))) {
     return failure();
   }
   return success();
@@ -852,9 +854,9 @@
         update, updateDims, builder.getIndexArrayAttr({0}));
 }
 
-static LogicalResult verifyTensorUpdateOp(TensorUpdateOp op) {
-  if (failed(verifyOpDynamicDims(op, {op.update()}, op.update_dims())) ||
-      failed(verifyOpDynamicDims(op, {op.target()}, op.target_dims()))) {
+LogicalResult TensorUpdateOp::verify() {
+  if (failed(verifyOpDynamicDims(getOperation(), {update()}, update_dims())) ||
+      failed(verifyOpDynamicDims(getOperation(), {target()}, target_dims()))) {
     return failure();
   }
   return success();
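
Every verifier change in this file follows the same two-step recipe from the upstream signature change: in ODS, the inline verifier code block becomes `let hasVerifier = 1;` (which declares `LogicalResult verify();` on the generated class, as the .td hunks below show), and the old static free function becomes that member, with direct access to the op's accessors. A minimal sketch with a hypothetical MyOp:

    // Sketch of the migration pattern; MyOp is hypothetical.
    // Before: `let verifier = [{ return verifyMyOp(*this); }];` calling
    //   static LogicalResult verifyMyOp(MyOp op);
    // After: `let hasVerifier = 1;` in the .td file, plus a member:
    LogicalResult MyOp::verify() {
      if (getOperation()->getNumOperands() == 0)
        return emitOpError() << "expected at least one operand";
      return success();
    }
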
diff --git a/iree/compiler/Dialect/Flow/IR/FlowOps.h b/iree/compiler/Dialect/Flow/IR/FlowOps.h
index 47381de..4db195e 100644
--- a/iree/compiler/Dialect/Flow/IR/FlowOps.h
+++ b/iree/compiler/Dialect/Flow/IR/FlowOps.h
@@ -25,9 +25,6 @@
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Interfaces/ViewLikeInterface.h"
 
-#define GET_OP_CLASSES
-#include "iree/compiler/Dialect/Flow/IR/FlowOps.h.inc"  // IWYU pragma: export
-
 namespace mlir {
 namespace iree_compiler {
 namespace IREE {
@@ -37,9 +34,15 @@
 void populateFlowDispatchCanonicalizationPatterns(
     ::mlir::RewritePatternSet &results, ::mlir::MLIRContext *context);
 
+// Verifies the flow.dispatch.workgroup.size/id/count operations.
+LogicalResult verifyDispatchWorkgroupInfoOp(Operation *op, uint64_t dimension);
+
 }  // namespace Flow
 }  // namespace IREE
 }  // namespace iree_compiler
 }  // namespace mlir
 
+#define GET_OP_CLASSES
+#include "iree/compiler/Dialect/Flow/IR/FlowOps.h.inc"  // IWYU pragma: export
+
 #endif  // IREE_COMPILER_DIALECT_FLOW_IR_FLOWOPS_H_
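
The reordering in this header is load-bearing, not cosmetic: the generated op classes now carry inline verify() bodies (via the extraClassDeclaration blocks in FlowOps.td below) that call verifyDispatchWorkgroupInfoOp, so that free function must be declared before the generated header is included. Schematically:

    // 1) Declare the shared verifier helper first...
    LogicalResult verifyDispatchWorkgroupInfoOp(Operation *op,
                                                uint64_t dimension);
    // 2) ...then include the generated classes whose inline verify()
    //    bodies resolve the call:
    #define GET_OP_CLASSES
    #include "iree/compiler/Dialect/Flow/IR/FlowOps.h.inc"
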
diff --git a/iree/compiler/Dialect/Flow/IR/FlowOps.td b/iree/compiler/Dialect/Flow/IR/FlowOps.td
index 19c8559..8bde738 100644
--- a/iree/compiler/Dialect/Flow/IR/FlowOps.td
+++ b/iree/compiler/Dialect/Flow/IR/FlowOps.td
@@ -129,8 +129,7 @@
     }
   }];
 
-  let verifier = [{ return verifyDispatchWorkgroupsOp(*this); }];
-
+  let hasVerifier = 1;
   let hasCanonicalizer = 1;
 }
 
@@ -184,7 +183,11 @@
   ];
   let assemblyFormat = "`[` $dimension `]` attr-dict `:` type($result)";
 
-  let verifier = [{ return verifyDispatchWorkgroupInfoOp(*this); }];
+  let extraClassDeclaration = [{
+    LogicalResult verify() {
+      return verifyDispatchWorkgroupInfoOp(getOperation(), dimension().getZExtValue());
+    }
+  }];
 }
 
 def FLOW_DispatchWorkgroupCountOp : FLOW_PureOp<"dispatch.workgroup.count", [
@@ -215,7 +218,11 @@
   ];
   let assemblyFormat = "`[` $dimension `]` attr-dict `:` type($result)";
 
-  let verifier = [{ return verifyDispatchWorkgroupInfoOp(*this); }];
+  let extraClassDeclaration = [{
+    LogicalResult verify() {
+      return verifyDispatchWorkgroupInfoOp(getOperation(), dimension().getZExtValue());
+    }
+  }];
 }
 
 def FLOW_DispatchWorkgroupSizeOp : FLOW_PureOp<"dispatch.workgroup.size", [
@@ -254,7 +261,11 @@
 
   let assemblyFormat = "`[` $dimension `]` attr-dict `:` type($result)";
 
-  let verifier = [{ return verifyDispatchWorkgroupInfoOp(*this); }];
+  let extraClassDeclaration = [{
+    LogicalResult verify() {
+      return verifyDispatchWorkgroupInfoOp(getOperation(), dimension().getZExtValue());
+    }
+  }];
 }
 
 def FLOW_DispatchTieShapeOp : FLOW_PureOp<"dispatch.tie_shape", [
@@ -287,7 +298,7 @@
     ValueRange getResultDynamicDims(unsigned idx) { return dynamic_dims(); }
   }];
 
-  let verifier = [{ return verifyDispatchTieShapeOp(*this); }];
+  let hasVerifier = 1;
 
   let hasFolder = 1;
 }
@@ -382,7 +393,7 @@
     ValueRange getResultDynamicDims(unsigned idx) { return sizes(); }
   }];
 
-  let verifier = [{ return verifyDispatchTensorLoadOp(*this); }];
+  let hasVerifier = 1;
 
   let hasCanonicalizer = 1;
   let hasFolder = 1;
@@ -460,7 +471,7 @@
     ValueRange getResultDynamicDims(unsigned idx) { return {}; }
   }];
 
-  let verifier = [{ return verifyDispatchTensorStoreOp(*this); }];
+  let hasVerifier = 1;
 
   let hasCanonicalizer = 1;
 }
@@ -530,7 +541,7 @@
     }
   }];
 
-  let verifier = [{ return verifyExecutableOp(*this); }];
+  let hasVerifier = 1;
 }
 
 def FLOW_ExecutableEndOp : FLOW_Op<"executable_end", [
@@ -643,7 +654,7 @@
                                $tied_operands)
   }];
 
-  let verifier = [{ return verifyDispatchOp(*this); }];
+  let hasVerifier = 1;
 }
 
 //===----------------------------------------------------------------------===//
@@ -704,7 +715,7 @@
     ValueRange getResultDynamicDims(unsigned idx) { return dynamic_dims(); }
   }];
 
-  let verifier = [{ return verifyTensorTieShapeOp(*this); }];
+  let hasVerifier = 1;
 
   let hasCanonicalizer = 1;
   let hasFolder = 1;
@@ -1075,7 +1086,7 @@
     ValueRange getResultDynamicDims(unsigned idx) { return target_dims(); }
   }];
 
-  let verifier = [{ return verifyTensorUpdateOp(*this); }];
+  let hasVerifier = 1;
 
   let hasCanonicalizer = 1;
   let hasFolder = 1;
diff --git a/iree/compiler/Dialect/HAL/IR/HALBase.td b/iree/compiler/Dialect/HAL/IR/HALBase.td
index 93d2e79..4e89a05 100644
--- a/iree/compiler/Dialect/HAL/IR/HALBase.td
+++ b/iree/compiler/Dialect/HAL/IR/HALBase.td
@@ -9,6 +9,7 @@
 
 include "iree/compiler/Dialect/HAL/IR/HALDialect.td"
 include "iree/compiler/Dialect/HAL/IR/HALInterfaces.td"
+include "mlir/IR/AttrTypeBase.td"
 
 //===----------------------------------------------------------------------===//
 // HAL enums
@@ -575,6 +576,7 @@
     static SmallVector<ExecutableTargetAttr, 4>
     lookupExecutableTargets(Operation *op);
   }];
+  let hasCustomAssemblyFormat = 1;
 }
 
 def HAL_ExecutableTargetAttr :
@@ -624,6 +626,7 @@
     // device that can load an executable of this target.
     Attribute getMatchExpression();
   }];
+  let hasCustomAssemblyFormat = 1;
 }
 
 //===----------------------------------------------------------------------===//
@@ -660,6 +663,7 @@
       return $_get(context, ArrayAttr::get(context, conditions));
     }]>,
   ];
+  let hasCustomAssemblyFormat = 1;
 }
 
 def HAL_MatchAllAttr :
@@ -679,6 +683,7 @@
       return $_get(context, ArrayAttr::get(context, conditions));
     }]>,
   ];
+  let hasCustomAssemblyFormat = 1;
 }
 
 def HAL_DeviceMatchIDAttr :
@@ -704,6 +709,7 @@
       return $_get(target.getContext(), target.getDeviceID());
     }]>,
   ];
+  let hasCustomAssemblyFormat = 1;
 }
 
 def HAL_DeviceMatchFeatureAttr :
@@ -727,6 +733,7 @@
       return $_get(pattern.getContext(), pattern);
     }]>,
   ];
+  let hasCustomAssemblyFormat = 1;
 }
 
 def HAL_DeviceMatchArchitectureAttr :
@@ -750,6 +757,7 @@
       return $_get(pattern.getContext(), pattern);
     }]>,
   ];
+  let hasCustomAssemblyFormat = 1;
 }
 
 def HAL_DeviceMatchExecutableFormatAttr :
@@ -786,6 +794,8 @@
       return $_get(target.getContext(), target.getFormat());
     }]>,
   ];
+
+  let hasCustomAssemblyFormat = 1;
 }
 
 //===----------------------------------------------------------------------===//
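
The repeated `let hasCustomAssemblyFormat = 1;` additions opt each attribute class into declaring its own parse/print hooks, replacing the old dialect-level parseAttribute/printAttribute dispatch; the new `include "mlir/IR/AttrTypeBase.td"` lines are related fallout, since the AttrDef/TypeDef infrastructure now lives in its own include. Roughly, for a hypothetical MyAttr the generated class declares hooks implemented as:

    // Sketch only; MyAttr is hypothetical and assumes the mlir namespace.
    Attribute MyAttr::parse(AsmParser &parser, Type type) {
      // Parse the attribute's `<...>` body here.
      return MyAttr::get(parser.getContext());
    }

    void MyAttr::print(AsmPrinter &printer) const {
      printer << "<...>";
    }
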
diff --git a/iree/compiler/Dialect/Stream/IR/StreamBase.td b/iree/compiler/Dialect/Stream/IR/StreamBase.td
index 24caa45..80174c3 100644
--- a/iree/compiler/Dialect/Stream/IR/StreamBase.td
+++ b/iree/compiler/Dialect/Stream/IR/StreamBase.td
@@ -10,6 +10,7 @@
 include "iree/compiler/Dialect/Stream/IR/StreamInterfaces.td"
 include "iree/compiler/Dialect/Util/IR/UtilBase.td"
 include "iree/compiler/Dialect/Util/IR/UtilInterfaces.td"
+include "mlir/IR/AttrTypeBase.td"
 include "mlir/IR/SubElementInterfaces.td"
 
 //===----------------------------------------------------------------------===//
@@ -109,8 +110,7 @@
 
 class Stream_Op<string mnemonic, list<Trait> traits = []> :
     Op<Stream_Dialect, mnemonic, traits> {
-  let parser = [{ return parse$cppClass(parser, &result); }];
-  let printer = [{ return print$cppClass(p, *this); }];
+  let hasCustomAssemblyFormat = 1;
 }
 
 //===----------------------------------------------------------------------===//
@@ -218,6 +218,8 @@
     // `stream.partitioning` attribute is found.
     static PartitioningConfigAttr lookup(Operation *op);
   }];
+
+  let hasCustomAssemblyFormat = 1;
 }
 
 def Stream_ResourceConfigAttr :
@@ -269,6 +271,8 @@
     // configuration, or as a fallback returns a conservative configuration.
     static ResourceConfigAttr lookup(Operation *op);
   }];
+
+  let hasCustomAssemblyFormat = 1;
 }
 
 def Stream_ResourceAccess_None : BitEnumAttrCase<"None", 0x0000>;
@@ -319,6 +323,7 @@
         $_builder.getContext(),
         IREE::Stream::TimepointType::get($_builder.getContext()));
   }];
+  let hasCustomAssemblyFormat = 1;
 }
 
 //===----------------------------------------------------------------------===//
@@ -399,6 +404,8 @@
       return $_get(lifetime.getContext(), lifetime.getValue());
     }]>,
   ];
+
+  let hasCustomAssemblyFormat = 1;
 }
 
 def Stream_ResourceLifetimeUnknown : CPred<[{
diff --git a/iree/compiler/Dialect/Util/IR/UtilAttrs.td b/iree/compiler/Dialect/Util/IR/UtilAttrs.td
index 1e6f5e4..0bb9baa 100644
--- a/iree/compiler/Dialect/Util/IR/UtilAttrs.td
+++ b/iree/compiler/Dialect/Util/IR/UtilAttrs.td
@@ -9,6 +9,7 @@
 
 include "iree/compiler/Dialect/Util/IR/UtilBase.td"
 include "iree/compiler/Dialect/Util/IR/UtilInterfaces.td"
+include "mlir/IR/AttrTypeBase.td"
 include "mlir/IR/OpBase.td"
 include "mlir/IR/SubElementInterfaces.td"
 
@@ -27,6 +28,8 @@
     AttrParameter<"int64_t", "">:$offset,
     AttrParameter<"int64_t", "">:$length
   );
+
+  let hasCustomAssemblyFormat = 1;
 }
 
 def Util_CompositeAttr : AttrDef<Util_Dialect, "Composite", [
@@ -65,6 +68,8 @@
   }];
 
   let genVerifyDecl = 1;
+
+  let hasCustomAssemblyFormat = 1;
 }
 
 #endif  // IREE_DIALECT_UTIL_IR_UTIL_ATTRS
diff --git a/iree/compiler/Dialect/Util/IR/UtilOps.cpp b/iree/compiler/Dialect/Util/IR/UtilOps.cpp
index fcbf88b..45211fd 100644
--- a/iree/compiler/Dialect/Util/IR/UtilOps.cpp
+++ b/iree/compiler/Dialect/Util/IR/UtilOps.cpp
@@ -608,19 +608,20 @@
   }
 }
 
-static LogicalResult verifyDoNotOptimizeOp(DoNotOptimizeOp op) {
-  if (op.getNumOperands() != op.getNumResults()) {
-    return op.emitOpError()
+LogicalResult DoNotOptimizeOp::verify() {
+  Operation *op = getOperation();
+  if (op->getNumOperands() != op->getNumResults()) {
+    return op->emitOpError()
            << "must have same number of operands and results, but has "
-           << op.getNumOperands() << " and " << op.getNumResults()
+           << op->getNumOperands() << " and " << op->getNumResults()
            << ", respectively";
   }
 
-  for (int i = 0, e = op.getNumOperands(); i < e; ++i) {
-    if (op.getOperand(i).getType() != op.getResult(i).getType()) {
-      op.emitOpError() << "must have same operand and result types, but they "
-                          "differ at index "
-                       << i;
+  for (int i = 0, e = op->getNumOperands(); i < e; ++i) {
+    if (op->getOperand(i).getType() != op->getResult(i).getType()) {
+      return op->emitOpError() << "must have same operand and result types, "
+                                  "but they differ at index "
+                               << i;
     }
   }
 
@@ -770,14 +771,15 @@
   build(builder, result, name, isMutable, type, llvm::None, attrs);
 }
 
-static LogicalResult verifyGlobalOp(GlobalOp op) {
-  if (op.initial_value().hasValue()) {
+LogicalResult GlobalOp::verify() {
+  Operation *op = getOperation();
+  if (initial_value().hasValue()) {
     // Ensure the value is something we can convert to a const.
-    if (!isGlobalTypeCompatible(op.type(), op.initial_valueAttr().getType())) {
+    if (!isGlobalTypeCompatible(type(), initial_valueAttr().getType())) {
       return op->emitOpError()
-             << "initial value type mismatch; global " << op.getSymbolName()
-             << " is " << op.type() << " but initial value provided is "
-             << op.initial_valueAttr().getType();
+             << "initial value type mismatch; global " << getSymbolName()
+             << " is " << type() << " but initial value provided is "
+             << initial_valueAttr().getType();
     }
   }
   return success();
@@ -795,10 +797,11 @@
   setNameFn(result(), Twine("ptr_" + global()).str());
 }
 
-static LogicalResult verifyGlobalAddressOp(GlobalAddressOp op) {
-  auto globalOp = op.getGlobalOp();
+LogicalResult GlobalAddressOp::verify() {
+  Operation *op = getOperation();
+  auto globalOp = getGlobalOp();
   if (!globalOp) {
-    return op.emitOpError() << "undefined global: " << op.global();
+    return op->emitOpError() << "undefined global: " << global();
   }
   return success();
 }
@@ -836,27 +839,29 @@
   }
 }
 
-static LogicalResult verifyGlobalLoadOp(GlobalLoadOp op) {
-  auto globalOp = op.getGlobalOp();
+LogicalResult GlobalLoadOp::verify() {
+  Operation *op = getOperation();
+  auto globalOp = getGlobalOp();
   if (!globalOp) {
-    return op->emitOpError() << "undefined global: " << op.global();
+    return op->emitOpError() << "undefined global: " << global();
   }
   auto loadType = op->getResult(0).getType();
   if (!isGlobalTypeCompatible(globalOp.type(), loadType)) {
     return op->emitOpError()
-           << "global type mismatch; global " << op.global() << " is "
+           << "global type mismatch; global " << global() << " is "
            << globalOp.type() << " but load is " << loadType;
   }
   return success();
 }
 
-static LogicalResult verifyGlobalLoadIndirectOp(GlobalLoadIndirectOp &op) {
+LogicalResult GlobalLoadIndirectOp::verify() {
+  Operation *op = getOperation();
   auto globalType =
-      op.global().getType().cast<IREE::Util::PtrType>().getTargetType();
-  auto loadType = op.result().getType();
+      global().getType().cast<IREE::Util::PtrType>().getTargetType();
+  auto loadType = result().getType();
   if (!isGlobalTypeCompatible(globalType, loadType)) {
-    return op.emitOpError() << "global type mismatch; global pointer is "
-                            << globalType << " but load is " << loadType;
+    return op->emitOpError() << "global type mismatch; global pointer is "
+                             << globalType << " but load is " << loadType;
   }
   return success();
 }
@@ -868,34 +873,36 @@
 
 FlatSymbolRefAttr GlobalStoreOp::getGlobalRefAttr() { return globalAttr(); }
 
-static LogicalResult verifyGlobalStoreOp(GlobalStoreOp op) {
-  auto globalOp = op.getGlobalOp();
+LogicalResult GlobalStoreOp::verify() {
+  Operation *op = getOperation();
+  auto globalOp = getGlobalOp();
   if (!globalOp) {
-    return op->emitOpError() << "undefined global: " << op.global();
+    return op->emitOpError() << "undefined global: " << global();
   }
   auto storeType = op->getOperand(0).getType();
   if (globalOp.type() != storeType) {
     return op->emitOpError()
-           << "global type mismatch; global " << op.global() << " is "
+           << "global type mismatch; global " << global() << " is "
            << globalOp.type() << " but store is " << storeType;
   }
   if (!globalOp.isMutable()) {
     // Allow stores to immutable globals in initializers.
     if (!op->getParentOfType<InitializerOp>()) {
-      return op->emitOpError() << "global " << op.global()
+      return op->emitOpError() << "global " << global()
                                << " is not mutable and cannot be stored to";
     }
   }
   return success();
 }
 
-static LogicalResult verifyGlobalStoreIndirectOp(GlobalStoreIndirectOp &op) {
+LogicalResult GlobalStoreIndirectOp::verify() {
+  Operation *op = getOperation();
   auto globalType =
-      op.global().getType().cast<IREE::Util::PtrType>().getTargetType();
-  auto storeType = op.value().getType();
+      global().getType().cast<IREE::Util::PtrType>().getTargetType();
+  auto storeType = value().getType();
   if (!isGlobalTypeCompatible(globalType, storeType)) {
-    return op.emitOpError() << "global type mismatch; global pointer is "
-                            << globalType << " but store is " << storeType;
+    return op->emitOpError() << "global type mismatch; global pointer is "
+                             << globalType << " but store is " << storeType;
   }
   return success();
 }
@@ -975,24 +982,26 @@
   }
 }
 
-static LogicalResult verifyListGetOp(ListGetOp &op) {
-  auto listType = op.list().getType().cast<IREE::Util::ListType>();
+LogicalResult ListGetOp::verify() {
+  Operation *op = getOperation();
+  auto listType = list().getType().cast<IREE::Util::ListType>();
   auto elementType = listType.getElementType();
-  auto resultType = op.result().getType();
+  auto resultType = result().getType();
   if (!ListType::canImplicitlyCast(elementType, resultType)) {
-    return op.emitError() << "list contains " << elementType
-                          << " and cannot be accessed as " << resultType;
+    return op->emitError() << "list contains " << elementType
+                           << " and cannot be accessed as " << resultType;
   }
   return success();
 }
 
-static LogicalResult verifyListSetOp(ListSetOp &op) {
-  auto listType = op.list().getType().cast<IREE::Util::ListType>();
+LogicalResult ListSetOp::verify() {
+  Operation *op = getOperation();
+  auto listType = list().getType().cast<IREE::Util::ListType>();
   auto elementType = listType.getElementType();
-  auto valueType = op.value().getType();
+  auto valueType = value().getType();
   if (!ListType::canImplicitlyCast(valueType, elementType)) {
-    return op.emitError() << "list contains " << elementType
-                          << " and cannot be mutated as " << valueType;
+    return op->emitError() << "list contains " << elementType
+                           << " and cannot be mutated as " << valueType;
   }
   return success();
 }
diff --git a/iree/compiler/Dialect/Util/IR/UtilOps.td b/iree/compiler/Dialect/Util/IR/UtilOps.td
index 545286d..08f6782 100644
--- a/iree/compiler/Dialect/Util/IR/UtilOps.td
+++ b/iree/compiler/Dialect/Util/IR/UtilOps.td
@@ -308,7 +308,7 @@
   }];
   let arguments = (ins Variadic<AnyType>:$arguments);
   let results = (outs Variadic<AnyType>:$results);
-  let verifier = [{ return verify$cppClass(*this); }];
+  let hasVerifier = 1;
   let builders = [
     OpBuilder<(ins
       "ValueRange":$operands,
@@ -488,7 +488,7 @@
     void setInitialValue(Attribute attr) { (*this)->setAttr("initial_value", attr); }
   }];
 
-  let verifier = [{ return verifyGlobalOp(*this); }];
+  let hasVerifier = 1;
 
   let hasCanonicalizer = 1;
 }
@@ -517,6 +517,7 @@
   let extraClassDeclaration = [{
     IREE::Util::GlobalOp getGlobalOp();
   }];
+  let hasVerifier = 1;
 }
 
 def Util_GlobalLoadOp : Util_Op<"global.load", [
@@ -553,7 +554,7 @@
     bool isGlobalImmutable();
   }];
 
-  let verifier = [{ return verifyGlobalLoadOp(*this); }];
+  let hasVerifier = 1;
 }
 
 def Util_GlobalLoadIndirectOp : Util_Op<"global.load.indirect"> {
@@ -573,7 +574,7 @@
     $global attr-dict `:` type($global) `->` type($result)
   }];
 
-  let verifier = [{ return verifyGlobalLoadIndirectOp(*this); }];
+  let hasVerifier = 1;
 
   let hasCanonicalizer = 1;
 }
@@ -599,7 +600,7 @@
     IREE::Util::GlobalOp getGlobalOp();
   }];
 
-  let verifier = [{ return verifyGlobalStoreOp(*this); }];
+  let hasVerifier = 1;
 
   let hasCanonicalizer = 1;
 }
@@ -619,7 +620,7 @@
     $value `,` $global attr-dict `:` type($value) `->` type($global)
   }];
 
-  let verifier = [{ return verifyGlobalStoreIndirectOp(*this); }];
+  let hasVerifier = 1;
 
   let hasCanonicalizer = 1;
 }
@@ -698,7 +699,7 @@
 
   let assemblyFormat = "$list `[` $index `]` attr-dict `:` custom<ListTypeGet>(type($list), type($result))";
 
-  let verifier = [{ return verify$cppClass(*this); }];
+  let hasVerifier = 1;
 }
 
 def Util_ListSetOp : Util_Op<"list.set", [MemoryEffects<[MemWrite]>]> {
@@ -715,7 +716,7 @@
 
   let assemblyFormat = "$list `[` $index `]` `,` $value attr-dict `:` custom<ListTypeSet>(type($list), type($value))";
 
-  let verifier = [{ return verify$cppClass(*this); }];
+  let hasVerifier = 1;
 }
 
 //===----------------------------------------------------------------------===//
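
For context on the `.td` changes just above: with `let hasVerifier = 1;`, mlir-tblgen emits a `verify()` member declaration into the generated header, roughly like the following (an approximation, not the literal generated code):

  class GlobalLoadOp : public ::mlir::Op<GlobalLoadOp /*, traits... */> {
   public:
    // Declared by ODS; the definition lives in UtilOps.cpp above.
    ::mlir::LogicalResult verify();
  };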
diff --git a/iree/compiler/Dialect/VM/IR/VMOps.cpp b/iree/compiler/Dialect/VM/IR/VMOps.cpp
index 3195ddf..9dfa11c 100644
--- a/iree/compiler/Dialect/VM/IR/VMOps.cpp
+++ b/iree/compiler/Dialect/VM/IR/VMOps.cpp
@@ -70,7 +70,7 @@
       mlir::SymbolTable::getSymbolAttrName(), builder.getStringAttr(name)));
 }
 
-static LogicalResult verifyModuleOp(ModuleOp op) {
+LogicalResult ModuleOp::verify() {
   // TODO(benvanik): check export name conflicts.
   return success();
 }
@@ -361,7 +361,7 @@
 // Globals
 //===----------------------------------------------------------------------===//
 
-static LogicalResult verifyGlobalOp(Operation *op) {
+LogicalResult verifyGlobalOp(Operation *op) {
   auto globalName =
       op->getAttrOfType<StringAttr>(SymbolTable::getSymbolAttrName());
   auto globalType = op->getAttrOfType<TypeAttr>("type");
@@ -398,11 +398,11 @@
   return success();
 }
 
-static LogicalResult verifyGlobalAddressOp(GlobalAddressOp op) {
-  auto *globalOp =
-      op->getParentOfType<VM::ModuleOp>().lookupSymbol(op.global());
+LogicalResult GlobalAddressOp::verify() {
+  Operation *op = getOperation();
+  auto *globalOp = op->getParentOfType<VM::ModuleOp>().lookupSymbol(global());
   if (!globalOp) {
-    return op.emitOpError() << "Undefined global: " << op.global();
+    return op->emitOpError() << "Undefined global: " << global();
   }
   return success();
 }
@@ -466,7 +466,7 @@
   setResultName(setNameFn, getResult(), global());
 }
 
-static LogicalResult verifyGlobalLoadOp(Operation *op) {
+LogicalResult verifyGlobalLoadOp(Operation *op) {
   auto globalAttr = op->getAttrOfType<FlatSymbolRefAttr>("global");
   auto *globalOp =
       op->getParentOfType<VM::ModuleOp>().lookupSymbol(globalAttr.getValue());
@@ -491,7 +491,7 @@
   return funcOp.getName() == "__init" || funcOp.getName() == "__deinit";
 }
 
-static LogicalResult verifyGlobalStoreOp(Operation *op) {
+LogicalResult verifyGlobalStoreOp(Operation *op) {
   auto globalAttr = op->getAttrOfType<FlatSymbolRefAttr>("global");
   auto *globalOp =
       op->getParentOfType<VM::ModuleOp>().lookupSymbol(globalAttr.getValue());
@@ -769,11 +769,11 @@
   result.addAttributes(attrs);
 }
 
-static LogicalResult verifyConstRefRodataOp(ConstRefRodataOp &op) {
-  auto *rodataOp =
-      op->getParentOfType<VM::ModuleOp>().lookupSymbol(op.rodata());
+LogicalResult ConstRefRodataOp::verify() {
+  Operation *op = getOperation();
+  auto *rodataOp = op->getParentOfType<VM::ModuleOp>().lookupSymbol(rodata());
   if (!rodataOp) {
-    return op.emitOpError() << "Undefined rodata section: " << op.rodata();
+    return op->emitOpError() << "Undefined rodata section: " << rodata();
   }
   return success();
 }
@@ -803,52 +803,55 @@
 // Lists
 //===----------------------------------------------------------------------===//
 
-static LogicalResult verifyListGetRefOp(ListGetRefOp &op) {
-  auto listType = op.list()
+LogicalResult ListGetRefOp::verify() {
+  Operation *op = getOperation();
+  auto listType = list()
                       .getType()
                       .cast<IREE::VM::RefType>()
                       .getObjectType()
                       .cast<IREE::VM::ListType>();
   auto elementType = listType.getElementType();
-  auto resultType = op.result().getType();
+  auto resultType = result().getType();
   if (!elementType.isa<IREE::VM::OpaqueType>()) {
     if (elementType.isa<IREE::VM::RefType>() !=
         resultType.isa<IREE::VM::RefType>()) {
       // Attempting to go between a primitive type and ref type.
-      return op.emitError() << "cannot convert between list type "
-                            << elementType << " and result type " << resultType;
+      return op->emitError()
+             << "cannot convert between list type " << elementType
+             << " and result type " << resultType;
     } else if (auto refType = elementType.dyn_cast<IREE::VM::RefType>()) {
       if (!refType.getObjectType().isa<IREE::VM::OpaqueType>() &&
           elementType != resultType) {
         // List has a concrete type, verify it matches.
-        return op.emitError() << "list contains " << elementType
-                              << " that cannot be accessed as " << resultType;
+        return op->emitError() << "list contains " << elementType
+                               << " that cannot be accessed as " << resultType;
       }
     }
   }
   return success();
 }
 
-static LogicalResult verifyListSetRefOp(ListSetRefOp &op) {
-  auto listType = op.list()
+LogicalResult ListSetRefOp::verify() {
+  Operation *op = getOperation();
+  auto listType = list()
                       .getType()
                       .cast<IREE::VM::RefType>()
                       .getObjectType()
                       .cast<IREE::VM::ListType>();
   auto elementType = listType.getElementType();
-  auto valueType = op.value().getType();
+  auto valueType = value().getType();
   if (!elementType.isa<IREE::VM::OpaqueType>()) {
     if (elementType.isa<IREE::VM::RefType>() !=
         valueType.isa<IREE::VM::RefType>()) {
       // Attempting to go between a primitive type and ref type.
-      return op.emitError() << "cannot convert between list type "
-                            << elementType << " and value type " << valueType;
+      return op->emitError() << "cannot convert between list type "
+                             << elementType << " and value type " << valueType;
     } else if (auto refType = elementType.dyn_cast<IREE::VM::RefType>()) {
       if (!refType.getObjectType().isa<IREE::VM::OpaqueType>() &&
           elementType != valueType) {
         // List has a concrete type, verify it matches.
-        return op.emitError() << "list contains " << elementType
-                              << " that cannot be mutated as " << valueType;
+        return op->emitError() << "list contains " << elementType
+                               << " that cannot be mutated as " << valueType;
       }
     }
   }
@@ -1259,12 +1262,11 @@
                             : falseDestOperandsMutable();
 }
 
-template <typename T>
-static LogicalResult verifyFailOp(T op) {
+LogicalResult verifyFailOp(Operation *op, Value statusVal) {
   APInt status;
-  if (matchPattern(op.status(), m_ConstantInt(&status))) {
+  if (matchPattern(statusVal, m_ConstantInt(&status))) {
     if (status == 0) {
-      return op.emitOpError() << "status is 0; expected to not be OK";
+      return op->emitOpError() << "status is 0; expected to not be OK";
     }
   }
   return success();
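
The templated `verifyFailOp` becomes a type-erased free function so that `vm.fail` and `vm.cond_fail` can share one out-of-line definition; each op now passes its own status operand explicitly. The call shape, as wired up in the VMOps.td changes further below:

  LogicalResult verify() { return verifyFailOp(getOperation(), status()); }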
diff --git a/iree/compiler/Dialect/VM/IR/VMOps.h b/iree/compiler/Dialect/VM/IR/VMOps.h
index 7a16dfc..7fc704a 100644
--- a/iree/compiler/Dialect/VM/IR/VMOps.h
+++ b/iree/compiler/Dialect/VM/IR/VMOps.h
@@ -23,6 +23,28 @@
 #include "mlir/Interfaces/ControlFlowInterfaces.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 
+namespace mlir {
+namespace iree_compiler {
+namespace IREE {
+namespace VM {
+
+/// Generic method for verifying VM fail ops.
+LogicalResult verifyFailOp(Operation *op, Value statusVal);
+
+/// Generic method for verifying VM global ops.
+LogicalResult verifyGlobalOp(Operation *op);
+
+/// Generic method for verifying VM global load ops.
+LogicalResult verifyGlobalLoadOp(Operation *op);
+
+/// Generic method for verifying VM global store ops.
+LogicalResult verifyGlobalStoreOp(Operation *op);
+
+}  // namespace VM
+}  // namespace IREE
+}  // namespace iree_compiler
+}  // namespace mlir
+
 #define GET_OP_CLASSES
 #include "iree/compiler/Dialect/VM/IR/VMOps.h.inc"  // IWYU pragma: export
 
diff --git a/iree/compiler/Dialect/VM/IR/VMOps.td b/iree/compiler/Dialect/VM/IR/VMOps.td
index 87323d6..9c06818 100644
--- a/iree/compiler/Dialect/VM/IR/VMOps.td
+++ b/iree/compiler/Dialect/VM/IR/VMOps.td
@@ -61,7 +61,7 @@
     Block& getBlock() { return this->getOperation()->getRegion(0).front(); }
   }];
 
-  let verifier = [{ return verifyModuleOp(*this); }];
+  let hasVerifier = 1;
 }
 
 def VM_ModuleTerminatorOp : VM_Op<"module_terminator", [
@@ -358,9 +358,8 @@
     void setInitialValue(Attribute value) { (*this)->setAttr("initial_value", (value)); }
     void clearInitialValue() { (*this)->removeAttr("initial_value"); }
     Optional<IntegerAttr> getOrdinalAttr() { return ordinalAttr(); }
+    LogicalResult verify() { return verifyGlobalOp(getOperation()); }
   }];
-
-  let verifier = [{ return verifyGlobalOp(*this); }];
 }
 
 def VM_GlobalI32Op : VM_GlobalOp<"global.i32",
@@ -438,7 +437,7 @@
 
   let assemblyFormat = "$global attr-dict `:` type($result)";
 
-  let verifier = [{ return verifyGlobalAddressOp(*this); }];
+  let hasVerifier = 1;
 }
 
 class VM_GlobalLoadOp<Type type, string mnemonic, list<Trait> traits = []> :
@@ -456,7 +455,11 @@
 
   let assemblyFormat = "$global attr-dict `:` type($value)";
 
-  let verifier = [{ return verifyGlobalLoadOp(*this); }];
+  let extraClassDeclaration = [{
+    LogicalResult verify() {
+      return verifyGlobalLoadOp(getOperation());
+    }
+  }];
 }
 
 class VM_GlobalLoadPrimitiveOp<Type type, string mnemonic, VM_OPC opcode,
@@ -484,7 +487,11 @@
 
   let assemblyFormat = "$value `,` $global attr-dict `:` type($value)";
 
-  let verifier = [{ return verifyGlobalStoreOp(*this); }];
+  let extraClassDeclaration = [{
+    LogicalResult verify() {
+      return verifyGlobalStoreOp(getOperation());
+    }
+  }];
 }
 
 class VM_GlobalStorePrimitiveOp<Type type, string mnemonic, VM_OPC opcode,
@@ -1010,7 +1017,7 @@
       CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,
   ];
 
-  let verifier = [{ return verify$cppClass(*this); }];
+  let hasVerifier = 1;
 }
 
 def VM_RodataInlineOp : VM_PureOp<"rodata.inline", [
@@ -1616,7 +1623,7 @@
     VM_EncResult<"result">,
   ];
 
-  let verifier = [{ return verify$cppClass(*this); }];
+  let hasVerifier = 1;
 }
 
 def VM_ListSetRefOp :
@@ -1644,7 +1651,7 @@
     VM_EncOperand<"value", 2>,
   ];
 
-  let verifier = [{ return verify$cppClass(*this); }];
+  let hasVerifier = 1;
 }
 
 //===----------------------------------------------------------------------===//
@@ -3679,7 +3686,11 @@
     }]>,
   ];
 
-  let verifier = [{ return verifyFailOp(*this); }];
+  let extraClassDeclaration = [{
+    LogicalResult verify() {
+      return verifyFailOp(getOperation(), status());
+    }
+  }];
 }
 
 def VM_CondFailOp : VM_Op<"cond_fail", [
@@ -3729,7 +3740,11 @@
     }]>,
   ];
 
-  let verifier = [{ return verifyFailOp(*this); }];
+  let extraClassDeclaration = [{
+    LogicalResult verify() {
+      return verifyFailOp(getOperation(), status());
+    }
+  }];
 
   let hasCanonicalizer = 1;
 }
diff --git a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/Input/InputBase.td b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/Input/InputBase.td
index 3990bd8..0d6565d 100644
--- a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/Input/InputBase.td
+++ b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/Input/InputBase.td
@@ -8,6 +8,7 @@
 #define IREE_DIALECTS_DIALECT_INPUT_BASE_TD
 
 include "mlir/IR/OpBase.td"
+include "mlir/IR/AttrTypeBase.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 
 def IREEInput_Dialect : Dialect {
diff --git a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/Input/InputDialect.td b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/Input/InputDialect.td
index cde0652..a60a1c6 100644
--- a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/Input/InputDialect.td
+++ b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/Input/InputDialect.td
@@ -61,17 +61,7 @@
 
   let parameters = (ins IREEInput_ElementTypeParameter:$elementType);
 
-  let printer = [{
-    $_printer << "<" << getElementType() << ">";
-  }];
-
-  let parser = [{
-    Type elementType;
-    if ($_parser.parseLess() || $_parser.parseType(elementType) ||
-        $_parser.parseGreater())
-      return Type();
-    return get($_ctxt, elementType);
-  }];
+  let hasCustomAssemblyFormat = 1;
 }
 
 def IREEInput_PtrType : IREEInput_Type<"Ptr"> {
@@ -80,17 +70,7 @@
   let summary = "Pointer to a concrete type";
   let parameters = (ins IREEInput_PtrTargetTypeParameter:$targetType);
 
-  let printer = [{
-    $_printer << "<" << getTargetType() << ">";
-  }];
-
-  let parser = [{
-    Type targetType;
-    if ($_parser.parseLess() || $_parser.parseType(targetType) ||
-        $_parser.parseGreater())
-      return Type();
-    return get($_ctxt, targetType);
-  }];
+  let hasCustomAssemblyFormat = 1;
 }
 
 #endif // IREE_DIALECTS_DIALECT_INPUT_DIALECT_TD
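
The same migration applies to types: the removed `printer`/`parser` blobs are replaced by `let hasCustomAssemblyFormat = 1;`, which makes mlir-tblgen declare parse/print hooks on the generated type class, approximately:

  // Approximate shape of the declarations ODS now generates per TypeDef:
  static ::mlir::Type parse(::mlir::AsmParser &parser);
  void print(::mlir::AsmPrinter &printer) const;

The out-of-line definitions for ListType and PtrType appear in InputDialect.cpp below.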
diff --git a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.td b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.td
index 75c0cad..8b7bd97 100644
--- a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.td
+++ b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.td
@@ -31,9 +31,8 @@
          LinalgExtInterface,
          SingleBlockImplicitTerminator<"::mlir::iree_compiler::IREE::LinalgExt::YieldOp">
   ])> {
-  let verifier = [{ return verify$cppClass(*this); }];
-  let printer = [{ return print$cppClass(p, *this); }];
-  let parser = [{ return parse$cppClass(parser, result); }];
+  let hasVerifier = 1;
+  let hasCustomAssemblyFormat = 1;
   code extraLinalgExtOpClassDeclaration = [{
     SmallVector<Value> getDestinationOperands(OpBuilder &b) {
       SmallVector<Value> dest(outputs().begin(), outputs().end());
diff --git a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMBase.td b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMBase.td
index 4f20e1d..c1d53cb 100644
--- a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMBase.td
+++ b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMBase.td
@@ -8,6 +8,7 @@
 #define IREE_DIALECTS_DIALECT_PYDM_IR_PYDM_BASE_TD
 
 include "mlir/IR/OpBase.td"
+include "mlir/IR/AttrTypeBase.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 
 def IREEPyDM_Dialect : Dialect {
@@ -34,14 +35,12 @@
 }
 
 class IREEPyDM_Op<string mnemonic, list<Trait> traits = []> :
-    Op<IREEPyDM_Dialect, mnemonic, traits> {
-  let verifier = [{ return ::verify(*this); }];
-}
+    Op<IREEPyDM_Dialect, mnemonic, traits> {}
 
 class IREEPyDM_PureOp<string mnemonic, list<Trait> traits = []> :
-    Op<IREEPyDM_Dialect, mnemonic, !listconcat(traits, [NoSideEffect])> {
-  let verifier = [{ return ::verify(*this); }];
-}
-class IREEPyDM_TypeDef<string name, list<Trait> traits = []> : TypeDef<IREEPyDM_Dialect, name, traits>;
+    Op<IREEPyDM_Dialect, mnemonic, !listconcat(traits, [NoSideEffect])> {}
+
+class IREEPyDM_TypeDef<string name, list<Trait> traits = []> :
+  TypeDef<IREEPyDM_Dialect, name, traits>;
 
 #endif // IREE_DIALECTS_DIALECT_PYDM_IR_PYDM_BASE_TD
diff --git a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMDialect.td b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMDialect.td
index 73313b6..ef6d862 100644
--- a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMDialect.td
+++ b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMDialect.td
@@ -150,51 +150,7 @@
     bool isSigned() const;
   }];
 
-  let printer = [{
-    auto w = getImpl()->bitWidth;
-    if (w) {
-      $_printer << "<";
-      if (*w == 0) {
-        $_printer << "*";
-      } else if (*w > 0) {
-        $_printer << *w;
-      } else {
-        $_printer << "unsigned " << (-*w);
-      }
-      $_printer << ">";
-    }
-  }];
-
-  let parser = [{
-    auto emitError = [&]() -> InFlightDiagnostic{
-      return $_parser.emitError($_parser.getCurrentLocation());
-    };
-    // Weak
-    if (failed($_parser.parseOptionalLess()))
-      return get($_ctxt);
-    // AP
-    if (succeeded($_parser.parseOptionalStar())) {
-      if (failed($_parser.parseGreater()))
-        return Type();
-      return get($_ctxt, None);
-    }
-
-    // Explicit
-    bool isSigned;
-    if (succeeded($_parser.parseOptionalKeyword("unsigned"))) {
-      isSigned = false;
-    } else {
-      isSigned = true;
-    }
-
-    int width;
-    if (failed($_parser.parseInteger(width)))
-      return Type();
-    if (failed($_parser.parseGreater()))
-      return Type();
-    if (!isSigned) width = -width;
-    return getChecked(emitError, $_ctxt, width);
-  }];
+  let hasCustomAssemblyFormat = 1;
 }
 
 def IREEPyDM_ListType : IREEPyDM_PrimitiveTypeDef<"List", ["isRefinable"]> {
@@ -216,59 +172,7 @@
       return Base::get($_ctxt, CollectionStorageClass::Boxed, nullptr);
     }]>
   ];
-
-  let printer = [{
-    if (getImpl()->uniformElementType ||
-        getImpl()->storageClass != CollectionStorageClass::Boxed) {
-      $_printer << "<";
-      switch (getImpl()->storageClass) {
-        case CollectionStorageClass::Boxed:
-          $_printer << "boxed";
-          break;
-        case CollectionStorageClass::Empty:
-          $_printer << "empty";
-          break;
-        case CollectionStorageClass::Unboxed:
-          $_printer << "unboxed";
-          break;
-      }
-
-      if (getImpl()->uniformElementType) {
-        $_printer << ",";
-        $_printer << getImpl()->uniformElementType;
-      }
-      $_printer << ">";
-    }
-  }];
-
-  let parser = [{
-    if ($_parser.parseOptionalLess())
-      return get($_ctxt, CollectionStorageClass::Boxed, nullptr);
-
-    Type t;
-    StringRef storageClassKeyword;
-    if ($_parser.parseKeyword(&storageClassKeyword))
-      return Type();
-    if ($_parser.parseComma())
-      return Type();
-    if ($_parser.parseType(t))
-      return Type();
-    if ($_parser.parseGreater())
-      return Type();
-
-    CollectionStorageClass storageClass;
-    if (storageClassKeyword == "boxed")
-      storageClass = CollectionStorageClass::Boxed;
-    else if (storageClassKeyword == "empty")
-      storageClass = CollectionStorageClass::Empty;
-    else if (storageClassKeyword == "unboxed")
-      storageClass = CollectionStorageClass::Unboxed;
-    else {
-      $_parser.emitError($_parser.getCurrentLocation(), "expected one of 'boxed', 'empty', 'unboxed'");
-      return Type();
-    }
-    return get($_ctxt, storageClass, t);
-  }];
+  let hasCustomAssemblyFormat = 1;
 
   let extraClassDeclaration = [{
     /// Gets the type used to store elements in the backing list.
@@ -330,28 +234,7 @@
     bool isWeak() const;
     bool isExplicit() const { return !isWeak(); }
   }];
-
-  let printer = [{
-    auto ft = getImpl()->floatType;
-    if (ft)
-      $_printer << "<" << ft << ">";
-  }];
-
-  let parser = [{
-    auto emitError = [&]() -> InFlightDiagnostic{
-      return $_parser.emitError($_parser.getCurrentLocation());
-    };
-    // Weak
-    if (failed($_parser.parseOptionalLess()))
-      return get($_ctxt);
-    // Explicit
-    FloatType subType;
-    if (failed($_parser.parseType(subType)))
-      return Type();
-    if (failed($_parser.parseGreater()))
-      return Type();
-    return getChecked(emitError, $_ctxt, subType);
-  }];
+  let hasCustomAssemblyFormat = 1;
 }
 
 def IREEPyDM_StrType : IREEPyDM_PrimitiveTypeDef<"Str"> {
@@ -424,29 +307,7 @@
       return Base::get($_ctxt, nullptr);
     }]>
   ];
-
-  let printer = [{
-    if (getImpl()->primitiveType)
-      $_printer << "<" << getImpl()->primitiveType << ">";
-  }];
-
-  let parser = [{
-    if ($_parser.parseOptionalLess())
-      return get($_ctxt, nullptr);
-
-    Type t;
-    if ($_parser.parseType(t))
-      return Type();
-    if ($_parser.parseGreater())
-      return Type();
-    if (auto primitiveType = t.dyn_cast<PrimitiveType>())
-      return get($_ctxt, primitiveType);
-    else {
-      $_parser.emitError(
-          $_parser.getNameLoc(), "expected a primitive type");
-      return Type();
-    }
-  }];
+  let hasCustomAssemblyFormat = 1;
 
   let extraClassDeclaration = [{
     static bool isGenericObjectType(Type t) {
@@ -479,27 +340,7 @@
   );
 
   let genVerifyDecl = 1;
-  let printer = [{
-    llvm::interleaveComma(getAlternatives(), $_printer);
-  }];
-
-  let parser = [{
-    if ($_parser.parseOptionalLess())
-      return get($_ctxt, {});
-
-    SmallVector<::mlir::Type> alternatives;
-
-    do {
-      Type type;
-      if ($_parser.parseType(type))
-        return Type();
-      alternatives.push_back(type);
-    } while (succeeded($_parser.parseOptionalComma()));
-
-    return getChecked([&]() {
-      return $_parser.emitError($_parser.getNameLoc());
-    }, $_ctxt, alternatives);
-  }];
+  let hasCustomAssemblyFormat = 1;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMOps.td b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMOps.td
index bfa4d63..bc5b181 100644
--- a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMOps.td
+++ b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMOps.td
@@ -41,6 +41,7 @@
     $lhs `[` $slice `]` `=` $rhs `:` type(operands) attr-dict
   }];
   let hasCanonicalizer = 1;
+  let hasVerifier = 0;
 }
 
 //===----------------------------------------------------------------------===//
@@ -182,6 +183,8 @@
     }]>
   ];
 
+  // TODO: Enforce invariants.
+  let hasVerifier = 0;
   let hasCustomAssemblyFormat = 1;
 }
 
@@ -476,6 +479,7 @@
   let assemblyFormat = [{
     ($elements^ `:` type($elements))? `->` type(results) attr-dict
   }];
+  let hasVerifier = 1;
 }
 
 def IREEPyDM_MakeTupleOp : IREEPyDM_PureOp<"make_tuple"> {
@@ -606,6 +610,7 @@
   let results = (outs Variadic<AnyType>:$results);
   let regions = (region SizedRegion<1>:$thenRegion, AnyRegion:$elseRegion);
 
+  let hasVerifier = 1;
   let hasCustomAssemblyFormat = 1;
 }
 
diff --git a/llvm-external-projects/iree-dialects/lib/Dialect/Input/InputDialect.cpp b/llvm-external-projects/iree-dialects/lib/Dialect/Input/InputDialect.cpp
index 060d308..a12a1b9 100644
--- a/llvm-external-projects/iree-dialects/lib/Dialect/Input/InputDialect.cpp
+++ b/llvm-external-projects/iree-dialects/lib/Dialect/Input/InputDialect.cpp
@@ -29,3 +29,41 @@
 #include "iree-dialects/Dialect/Input/InputOps.cpp.inc"
       >();
 }
+
+namespace mlir {
+namespace iree_compiler {
+namespace IREE {
+namespace Input {
+
+// ListType
+Type ListType::parse(AsmParser &parser) {
+  MLIRContext *ctxt = parser.getContext();
+  Type elementType;
+  if (parser.parseLess() || parser.parseType(elementType) ||
+      parser.parseGreater())
+    return Type();
+  return get(ctxt, elementType);
+}
+
+void ListType::print(AsmPrinter &printer) const {
+  printer << "<" << getElementType() << ">";
+}
+
+// PtrType
+Type PtrType::parse(AsmParser &parser) {
+  MLIRContext *ctxt = parser.getContext();
+  Type targetType;
+  if (parser.parseLess() || parser.parseType(targetType) ||
+      parser.parseGreater())
+    return Type();
+  return get(ctxt, targetType);
+}
+
+void PtrType::print(AsmPrinter &printer) const {
+  printer << "<" << getTargetType() << ">";
+}
+
+}  // namespace Input
+}  // namespace IREE
+}  // namespace iree_compiler
+}  // namespace mlir
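
Assuming the usual mnemonics for these two types, the textual round trip is unchanged, e.g. `!iree_input.list<tensor<?xf32>>` and `!iree_input.ptr<i32>`; only the location of the parse/print logic moves, from inline `.td` blobs into the dialect library.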
diff --git a/llvm-external-projects/iree-dialects/lib/Dialect/LinalgExt/IR/LinalgExtOps.cpp b/llvm-external-projects/iree-dialects/lib/Dialect/LinalgExt/IR/LinalgExtOps.cpp
index cabc5c4..af9ae07 100644
--- a/llvm-external-projects/iree-dialects/lib/Dialect/LinalgExt/IR/LinalgExtOps.cpp
+++ b/llvm-external-projects/iree-dialects/lib/Dialect/LinalgExt/IR/LinalgExtOps.cpp
@@ -104,48 +104,49 @@
 //===----------------------------------------------------------------------===//
 // ScatterOp
 //===----------------------------------------------------------------------===//
-static LogicalResult verifyScatterOp(ScatterOp op) {
-  if (op.inputs().size() != 2) {
-    return op.emitOpError("expected two input operands");
+LogicalResult ScatterOp::verify() {
+  Operation *op = getOperation();
+  if (inputs().size() != 2) {
+    return op->emitOpError("expected two input operands");
   }
-  if (op.outputs().size() != 1) {
-    return op.emitOpError("expected one output operand");
+  if (outputs().size() != 1) {
+    return op->emitOpError("expected one output operand");
   }
   auto checkDimensionsMatch = [&](ShapedType t1, ShapedType t2, unsigned dim) {
     return t1.getShape()[dim] == t2.getShape()[dim];
   };
 
-  auto indicesType = op.getIndicesType();
+  auto indicesType = getIndicesType();
   if (indicesType.getRank() != 2 ||
       !indicesType.getElementType().isInteger(32)) {
-    return op.emitOpError(
+    return op->emitOpError(
         "expected indices to be of rank 2 of i32 element type");
   }
-  auto indexDepth = op.getIndexDepth();
+  auto indexDepth = getIndexDepth();
   if (indexDepth == ShapedType::kDynamicSize) {
-    return op.emitOpError("expected index depth is static");
+    return op->emitOpError("expected index depth is static");
   }
 
   // The first dimension of the indices should match the first dimension of
   // the update value; both give the number of updates.
-  auto updateType = op.getUpdateType();
+  auto updateType = getUpdateType();
   if (updateType.getRank() < 1) {
-    return op.emitOpError("expected update value to be at least rank 1");
+    return op->emitOpError("expected update value to be at least rank 1");
   }
   if (!checkDimensionsMatch(indicesType, updateType, 0)) {
-    return op.emitOpError(
+    return op->emitOpError(
         "mismatch in shape of indices and update value at dim#0");
   }
-  auto originalType = op.getOriginalType();
+  auto originalType = getOriginalType();
   if (updateType.getRank() - 1 > originalType.getRank()) {
-    return op.emitOpError(
+    return op->emitOpError(
         "update value rank exceeds the rank of the original value");
   }
 
   // indexDepth + update dims should cover the original dims. The first dim of
   // update is the number of updates.
   if (originalType.getRank() > indexDepth + updateType.getRank() - 1) {
-    return op.emitOpError(
+    return op->emitOpError(
         "index depth and update value does not cover rank of original value");
   }
 
@@ -160,7 +161,7 @@
     int64_t updateDim = std::get<1>(it);
     if (updateType.getDimSize(updateDim) !=
         originalType.getDimSize(originalDim)) {
-      return op.emitOpError("mismatch in shape of update value dim#")
+      return op->emitOpError("mismatch in shape of update value dim#")
              << updateDim << " and original value at dim#" << originalDim;
     }
   }
@@ -174,36 +175,36 @@
     int64_t updateDim = std::get<1>(it);
     if (updateType.getDimSize(updateDim) >
         originalType.getDimSize(originalDim)) {
-      return op.emitOpError("indexed shape of update value dim#")
+      return op->emitOpError("indexed shape of update value dim#")
              << updateDim << " exceeds original value at dim#" << originalDim
              << " " << updateType.getDimSize(updateDim) << " "
              << originalType.getDimSize(originalDim);
     }
   }
 
-  Region &region = op.region();
+  Region &region = this->region();
   Block *body = &region.front();
   if (body->getNumArguments() != 2) {
-    return op.emitOpError("expected region to have two arguments");
+    return op->emitOpError("expected region to have two arguments");
   }
   Type arg0Type = body->getArgument(0).getType();
   Type arg1Type = body->getArgument(1).getType();
   if (!arg0Type.isIntOrFloat() || !arg1Type.isIntOrFloat()) {
-    return op.emitOpError(
+    return op->emitOpError(
         "expected region to have scalar argument of integer or float types");
   }
   if (arg0Type != updateType.getElementType()) {
-    return op.emitOpError("mismatch in argument 0 of region ")
+    return op->emitOpError("mismatch in argument 0 of region ")
            << arg0Type << " and element type of update value "
            << updateType.getElementType();
   }
   if (arg1Type != originalType.getElementType()) {
-    return op.emitOpError("mismatch in argument 1 of region ")
+    return op->emitOpError("mismatch in argument 1 of region ")
            << arg1Type << " and element type of original value "
            << originalType.getElementType();
   }
   if (arg0Type != arg1Type) {
-    return op.emitOpError("mismatch in region argument types ")
+    return op->emitOpError("mismatch in region argument types ")
            << arg0Type << " and " << arg1Type;
   }
   auto yieldOp = cast<IREE::LinalgExt::YieldOp>(body->getTerminator());
@@ -354,44 +355,45 @@
 // SortOp
 //===----------------------------------------------------------------------===//
 
-static LogicalResult verifySortOp(SortOp op) {
-  if (op.getNumInputs()) {
-    return op.emitOpError("does not expect to take any inputs");
+LogicalResult SortOp::verify() {
+  Operation *op = getOperation();
+  if (getNumInputs()) {
+    return op->emitOpError("does not expect to take any inputs");
   }
-  if (op.getNumOutputs() == 0) {
-    return op.emitOpError("expected at least one `outs` operand");
+  if (getNumOutputs() == 0) {
+    return op->emitOpError("expected at least one `outs` operand");
   }
 
-  Block &block = op.region().front();
-  size_t numOutputs = op.getNumOutputs();
+  Block &block = region().front();
+  size_t numOutputs = getNumOutputs();
   if (block.getNumArguments() != 2 * numOutputs) {
-    return op.emitOpError("region block should have ")
+    return op->emitOpError("region block should have ")
            << 2 * numOutputs << " arguments";
   }
 
-  int64_t rank = op.getOperandRank();
-  int sortDim = op.dimension();
+  int64_t rank = getOperandRank();
+  int sortDim = dimension();
   if (sortDim < 0 || sortDim >= rank) {
-    return op.emitOpError("dimension must be within (0, ") << rank << "]";
+    return op->emitOpError("dimension must be within (0, ") << rank << "]";
   }
 
-  ArrayRef<int64_t> shape = op.getOperandShape();
-  for (auto indexedOperand : llvm::enumerate(op.outputs())) {
+  ArrayRef<int64_t> shape = getOperandShape();
+  for (auto indexedOperand : llvm::enumerate(outputs())) {
     int index = indexedOperand.index();
-    auto operandType = op.getOperandType(index);
+    auto operandType = getOperandType(index);
     if (operandType.getRank() != rank) {
-      return op.emitOpError("expected operand ")
+      return op->emitOpError("expected operand ")
              << index << " to be rank " << rank << ", same as other operands";
     }
     if (operandType.getShape() != shape) {
-      return op.emitOpError("expected operand ")
+      return op->emitOpError("expected operand ")
              << index << " to have same shape as other operands";
     }
     Type elemType = operandType.getElementType();
     for (int i : {2 * index, 2 * index + 1}) {
       Type argType = block.getArgument(i).getType();
       if (argType != elemType) {
-        return op.emitOpError("region block argument #")
+        return op->emitOpError("region block argument #")
                << i << " should be of type " << elemType << " but got "
                << argType;
       }
@@ -400,11 +402,11 @@
 
   auto yieldOp = cast<YieldOp>(block.getTerminator());
   if (yieldOp.getNumOperands() != 1) {
-    return op.emitOpError("should yield exactly one operand");
+    return op->emitOpError("should yield exactly one operand");
   }
   auto ty = yieldOp.getOperand(0).getType().dyn_cast<IntegerType>();
   if (!ty || ty.getWidth() != 1) {
-    return op.emitOpError("should yield i1 type");
+    return op->emitOpError("should yield i1 type");
   }
 
   return success();
@@ -560,26 +562,28 @@
 // FftOp
 //===----------------------------------------------------------------------===//
 
-static LogicalResult verifyFftOp(FftOp op) {
-  auto length = op.getFftLength();
+LogicalResult FftOp::verify() {
+  Operation *op = getOperation();
+  auto length = getFftLength();
   // After tiling, the shape could be dynamic. (Because
   // subview/subtensor does not infer the type correctly
   // in (1 << x) cases.)
   if (length == ShapedType::kDynamicSize) return success();
   if (length & (length - 1)) {
-    return op.emitOpError("only powers of 2 are handled currently");
+    return op->emitOpError("only powers of 2 are handled currently");
   }
-  if (!op.getNumInputs() || !op.isScalar(op.getInputOperand(0))) {
-    return op.emitOpError("expected to carry `stage` input");
+  if (!getNumInputs() || !isScalar(getInputOperand(0))) {
+    return op->emitOpError("expected to carry `stage` input");
   }
-  if (op.getNumInputs() != 1) {
-    if (op.getNumInputs() != 3 || op.isScalar(op.getInputOperand(1)) ||
-        op.isScalar(op.getInputOperand(2))) {
-      return op.emitOpError("expected to carry real and imag coeff inputs");
+  if (getNumInputs() != 1) {
+    if (getNumInputs() != 3 || isScalar(getInputOperand(1)) ||
+        isScalar(getInputOperand(2))) {
+      return op->emitOpError("expected to carry real and imag coeff inputs");
     }
   }
-  if (op.getNumOutputs() != 2) {
-    return op.emitOpError("expected outputs to be real and imag tensor/memref");
+  if (getNumOutputs() != 2) {
+    return op->emitOpError(
+        "expected outputs to be real and imag tensor/memref");
   }
   return success();
 }
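
The length check above uses the standard power-of-two bit trick; a self-contained sketch of the test it performs:

  // (n & (n - 1)) clears the lowest set bit of n, so the result is zero
  // exactly when n has at most one bit set (i.e. n is 0 or a power of two).
  static bool isPowerOfTwoOrZero(uint64_t n) { return (n & (n - 1)) == 0; }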
@@ -810,34 +814,35 @@
 // ScanOp
 //===----------------------------------------------------------------------===//
 
-static LogicalResult verifyScanOp(ScanOp op) {
-  if (op.getNumInputs() != 1) {
-    return op.emitOpError("expected one input operands");
+LogicalResult ScanOp::verify() {
+  Operation *op = getOperation();
+  if (getNumInputs() != 1) {
+    return op->emitOpError("expected one input operands");
   }
-  if (op.getNumOutputs() != 2) {
-    return op.emitOpError("expected two output operands");
+  if (getNumOutputs() != 2) {
+    return op->emitOpError("expected two output operands");
   }
-  if (!op.input().getType().isa<ShapedType>()) {
-    return op.emitOpError("expected first input element type to be shaped");
+  if (!input().getType().isa<ShapedType>()) {
+    return op->emitOpError("expected first input element type to be shaped");
   }
-  auto accumulatorType = op.accumulator().getType().cast<ShapedType>();
-  auto inputType = op.input().getType().cast<ShapedType>();
-  auto outputType = op.output().getType().cast<ShapedType>();
+  auto accumulatorType = accumulator().getType().cast<ShapedType>();
+  auto inputType = input().getType().cast<ShapedType>();
+  auto outputType = output().getType().cast<ShapedType>();
   ArrayRef<int64_t> inputShapes = inputType.getShape();
   ArrayRef<int64_t> outputShapes = outputType.getShape();
   if (accumulatorType.getElementType() != inputType.getElementType()) {
-    return op.emitOpError(
+    return op->emitOpError(
         "expected input/accumulator element types to be identical");
   }
   ArrayRef<int64_t> accumulatorShape = accumulatorType.getShape();
   int64_t accumulatorRank = accumulatorType.getRank();
   if (accumulatorRank != inputType.getRank() - 1) {
-    return op.emitOpError(
+    return op->emitOpError(
         "expected accumulator rank to be equal to input rank - 1");
   }
   SmallVector<int64_t> expectedAccumulatorShape;
   for (int i = 0; i < inputType.getRank(); i++) {
-    if (i != op.dimension()) expectedAccumulatorShape.push_back(inputShapes[i]);
+    if (i != dimension()) expectedAccumulatorShape.push_back(inputShapes[i]);
   }
   if (llvm::any_of(llvm::zip(expectedAccumulatorShape, accumulatorShape),
                    [](std::tuple<int64_t, int64_t> s) {
@@ -845,14 +850,14 @@
                             std::get<1>(s) != ShapedType::kDynamicSize &&
                             std::get<0>(s) != std::get<1>(s);
                    })) {
-    return op.emitOpError("incompatible input/accumulator shapes");
+    return op->emitOpError("incompatible input/accumulator shapes");
   }
   if (inputType.getElementType() != outputType.getElementType()) {
-    return op.emitOpError(
+    return op->emitOpError(
         "expected input/output element types to be identical");
   }
   if (inputShapes.size() != outputShapes.size()) {
-    return op.emitOpError("expected input/output to have identical ranks");
+    return op->emitOpError("expected input/output to have identical ranks");
   }
   if (llvm::any_of(llvm::zip(inputShapes, outputShapes),
                    [](std::tuple<int64_t, int64_t> s) {
@@ -860,7 +865,7 @@
                             std::get<1>(s) != ShapedType::kDynamicSize &&
                             std::get<0>(s) != std::get<1>(s);
                    })) {
-    return op.emitOpError("incompatible input/output shapes");
+    return op->emitOpError("incompatible input/output shapes");
   }
   return success();
 }
@@ -1042,23 +1047,24 @@
 // ReverseOp
 //===----------------------------------------------------------------------===//
 
-static LogicalResult verifyReverseOp(ReverseOp op) {
-  if (op.getNumInputs() != 1) {
-    return op.emitOpError("expected exactly one input");
+LogicalResult ReverseOp::verify() {
+  Operation *op = getOperation();
+  if (getNumInputs() != 1) {
+    return op->emitOpError("expected exactly one input");
   }
-  if (op.getNumOutputs() != 1) {
-    return op.emitOpError("expected exactly one output");
+  if (getNumOutputs() != 1) {
+    return op->emitOpError("expected exactly one output");
   }
-  auto inputType = op.input().getType().cast<ShapedType>();
-  auto outputType = op.output().getType().cast<ShapedType>();
+  auto inputType = input().getType().cast<ShapedType>();
+  auto outputType = output().getType().cast<ShapedType>();
   if (inputType.getElementType() != outputType.getElementType()) {
-    return op.emitOpError(
+    return op->emitOpError(
         "expected input/output element types to be identical");
   }
   ArrayRef<int64_t> inputShapes = inputType.getShape();
   ArrayRef<int64_t> outputShapes = outputType.getShape();
   if (inputShapes.size() != outputShapes.size()) {
-    return op.emitOpError("expexted input/output to have identical ranks");
+    return op->emitOpError("expexted input/output to have identical ranks");
   }
   if (llvm::any_of(llvm::zip(inputShapes, outputShapes),
                    [](std::tuple<int64_t, int64_t> s) {
@@ -1066,18 +1072,18 @@
                             std::get<1>(s) != ShapedType::kDynamicSize &&
                             std::get<0>(s) != std::get<1>(s);
                    })) {
-    return op.emitOpError("incompatible input/output shapes");
+    return op->emitOpError("incompatible input/output shapes");
   }
 
-  int64_t rank = op.getOperandRank();
+  int64_t rank = getOperandRank();
   llvm::SmallSetVector<int64_t, 4> s;
-  for (auto dim : op.dims()) {
+  for (auto dim : dims()) {
     if (dim < 0 || dim >= rank) {
-      return op.emitOpError("all the dimensions must be within [0, ")
+      return op->emitOpError("all the dimensions must be within [0, ")
              << rank << ")";
     }
     if (s.contains(dim)) {
-      return op.emitOpError("expected dimensions numbers are all unique");
+      return op->emitOpError("expected dimensions numbers are all unique");
     }
     s.insert(dim);
   }
diff --git a/llvm-external-projects/iree-dialects/lib/Dialect/PyDM/IR/PyDMDialect.cpp b/llvm-external-projects/iree-dialects/lib/Dialect/PyDM/IR/PyDMDialect.cpp
index 1915da7..82381bc 100644
--- a/llvm-external-projects/iree-dialects/lib/Dialect/PyDM/IR/PyDMDialect.cpp
+++ b/llvm-external-projects/iree-dialects/lib/Dialect/PyDM/IR/PyDMDialect.cpp
@@ -33,7 +33,10 @@
 using PyBoolType = PYDM::BoolType;
 using PyConstantOp = PYDM::ConstantOp;
 using PyIntegerType = PYDM::IntegerType;
+using PyListType = PYDM::ListType;
 using PyRealType = PYDM::RealType;
+using PyObjectType = PYDM::ObjectType;
+using PyUnionType = PYDM::UnionType;
 
 void IREEPyDMDialect::initialize() {
   addTypes<
@@ -115,6 +118,49 @@
   return emitError() << "unsupported python integer bit width: " << w;
 }
 
+Type PyIntegerType::parse(mlir::AsmParser &parser) {
+  MLIRContext *ctxt = parser.getContext();
+  auto emitError = [&]() -> InFlightDiagnostic {
+    return parser.emitError(parser.getCurrentLocation());
+  };
+  // Weak
+  if (failed(parser.parseOptionalLess())) return get(ctxt);
+  // AP
+  if (succeeded(parser.parseOptionalStar())) {
+    if (failed(parser.parseGreater())) return Type();
+    return get(ctxt, None);
+  }
+
+  // Explicit
+  bool isSigned;
+  if (succeeded(parser.parseOptionalKeyword("unsigned"))) {
+    isSigned = false;
+  } else {
+    isSigned = true;
+  }
+
+  int width;
+  if (failed(parser.parseInteger(width))) return Type();
+  if (failed(parser.parseGreater())) return Type();
+  if (!isSigned) width = -width;
+  return getChecked(emitError, ctxt, width);
+}
+
+void PyIntegerType::print(mlir::AsmPrinter &printer) const {
+  auto w = getImpl()->bitWidth;
+  if (w) {
+    printer << "<";
+    if (*w == 0) {
+      printer << "*";
+    } else if (*w > 0) {
+      printer << *w;
+    } else {
+      printer << "unsigned " << (-*w);
+    }
+    printer << ">";
+  }
+}
+
 BuiltinTypeCode PYDM::IntegerType::getTypeCode() const {
   return static_cast<BuiltinTypeCode>(
       makeNumericTypeCode(*getNumericCategory(), *getNumericSubTypeCode()));
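
A key for the width encoding that `PyIntegerType::print` and `parse` above agree on (derived from the code; the `integer` mnemonic spelling is an assumption here):

  // bitWidth (Optional<int>) encoding:
  //   None        -> weak integer,        prints with no <...> suffix
  //   0           -> arbitrary precision, prints as integer<*>
  //   +w (w > 0)  -> signed w-bit,        prints as integer<w>
  //   -w (w > 0)  -> unsigned w-bit,      prints as integer<unsigned w>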
@@ -170,6 +216,57 @@
 }
 
 // ListType
+void PyListType::print(mlir::AsmPrinter &printer) const {
+  if (getImpl()->uniformElementType ||
+      getImpl()->storageClass != CollectionStorageClass::Boxed) {
+    printer << "<";
+    switch (getImpl()->storageClass) {
+      case CollectionStorageClass::Boxed:
+        printer << "boxed";
+        break;
+      case CollectionStorageClass::Empty:
+        printer << "empty";
+        break;
+      case CollectionStorageClass::Unboxed:
+        printer << "unboxed";
+        break;
+    }
+
+    if (getImpl()->uniformElementType) {
+      printer << ",";
+      printer << getImpl()->uniformElementType;
+    }
+    printer << ">";
+  }
+}
+
+Type PyListType::parse(mlir::AsmParser &parser) {
+  MLIRContext *ctxt = parser.getContext();
+  if (parser.parseOptionalLess())
+    return get(ctxt, CollectionStorageClass::Boxed, nullptr);
+
+  Type t;
+  StringRef storageClassKeyword;
+  if (parser.parseKeyword(&storageClassKeyword)) return Type();
+  if (parser.parseComma()) return Type();
+  if (parser.parseType(t)) return Type();
+  if (parser.parseGreater()) return Type();
+
+  CollectionStorageClass storageClass;
+  if (storageClassKeyword == "boxed")
+    storageClass = CollectionStorageClass::Boxed;
+  else if (storageClassKeyword == "empty")
+    storageClass = CollectionStorageClass::Empty;
+  else if (storageClassKeyword == "unboxed")
+    storageClass = CollectionStorageClass::Unboxed;
+  else {
+    parser.emitError(parser.getCurrentLocation(),
+                     "expected one of 'boxed', 'empty', 'unboxed'");
+    return Type();
+  }
+  return get(ctxt, storageClass, t);
+}
+
 StringRef PYDM::ListType::getPythonTypeName() const { return "list"; }
 
 BuiltinTypeCode PYDM::NoneType::getTypeCode() const {
@@ -206,6 +303,26 @@
 StringRef PYDM::NoneType::getPythonTypeName() const { return "None"; }
 
 // ObjectType
+void PyObjectType::print(mlir::AsmPrinter &printer) const {
+  if (getImpl()->primitiveType)
+    printer << "<" << getImpl()->primitiveType << ">";
+}
+
+Type PyObjectType::parse(mlir::AsmParser &parser) {
+  MLIRContext *ctxt = parser.getContext();
+  if (parser.parseOptionalLess()) return get(ctxt, nullptr);
+
+  Type t;
+  if (parser.parseType(t)) return Type();
+  if (parser.parseGreater()) return Type();
+  if (auto primitiveType = t.dyn_cast<PrimitiveType>())
+    return get(ctxt, primitiveType);
+  else {
+    parser.emitError(parser.getNameLoc(), "expected a primitive type");
+    return Type();
+  }
+}
+
 BuiltinTypeCode PYDM::ObjectType::getTypeCode() const {
   return BuiltinTypeCode::Object;
 }
@@ -222,6 +339,26 @@
 }
 
 // RealType
+void PyRealType::print(mlir::AsmPrinter &printer) const {
+  auto ft = getImpl()->floatType;
+  if (ft) printer << "<" << ft << ">";
+}
+
+Type PyRealType::parse(mlir::AsmParser &parser) {
+  MLIRContext *ctxt = parser.getContext();
+
+  auto emitError = [&]() -> InFlightDiagnostic {
+    return parser.emitError(parser.getCurrentLocation());
+  };
+  // Weak
+  if (failed(parser.parseOptionalLess())) return get(ctxt);
+  // Explicit
+  FloatType subType;
+  if (failed(parser.parseType(subType))) return Type();
+  if (failed(parser.parseGreater())) return Type();
+  return getChecked(emitError, ctxt, subType);
+}
+
 LogicalResult PYDM::RealType::verify(
     function_ref<InFlightDiagnostic()> emitError, FloatType floatType) {
   if (!floatType) return success();
@@ -295,6 +432,26 @@
 // Union type implementation
 //------------------------------------------------------------------------------
 
+void PyUnionType::print(mlir::AsmPrinter &printer) const {
+  llvm::interleaveComma(getAlternatives(), printer);
+}
+
+Type PyUnionType::parse(mlir::AsmParser &parser) {
+  MLIRContext *ctxt = parser.getContext();
+  if (parser.parseOptionalLess()) return get(ctxt, {});
+
+  SmallVector<::mlir::Type> alternatives;
+
+  do {
+    Type type;
+    if (parser.parseType(type)) return Type();
+    alternatives.push_back(type);
+  } while (succeeded(parser.parseOptionalComma()));
+
+  return getChecked([&]() { return parser.emitError(parser.getNameLoc()); },
+                    ctxt, alternatives);
+}
+
 LogicalResult PYDM::UnionType::verify(
     llvm::function_ref<InFlightDiagnostic()> emitError,
     ArrayRef<Type> alternatives) {
diff --git a/llvm-external-projects/iree-dialects/lib/Dialect/PyDM/IR/PyDMOps.cpp b/llvm-external-projects/iree-dialects/lib/Dialect/PyDM/IR/PyDMOps.cpp
index cbb07b8..2010688 100644
--- a/llvm-external-projects/iree-dialects/lib/Dialect/PyDM/IR/PyDMOps.cpp
+++ b/llvm-external-projects/iree-dialects/lib/Dialect/PyDM/IR/PyDMOps.cpp
@@ -29,8 +29,6 @@
 using PyCallOp = PYDM::CallOp;
 using PyFuncOp = PYDM::FuncOp;
 
-static LogicalResult verify(Operation *) { return success(); }
-
 //===----------------------------------------------------------------------===//
 // Utilities
 //===----------------------------------------------------------------------===//
@@ -439,9 +437,9 @@
 
 ::llvm::StringRef FunctionalIfOp::getDefaultDialect() { return "iree_pydm"; }
 
-static LogicalResult verify(FunctionalIfOp op) {
-  if (op.getNumResults() != 0 && op.elseRegion().empty())
-    return op.emitOpError("must have an else block if defining values");
+LogicalResult FunctionalIfOp::verify() {
+  if (getNumResults() != 0 && elseRegion().empty())
+    return emitOpError("must have an else block if defining values");
   return success();
 }
 
@@ -562,39 +560,34 @@
       p, *this, fnType.getInputs(), /*isVariadic=*/false, fnType.getResults());
 }
 
-static LogicalResult verify(PyFuncOp op) {
-  // TODO: Enforce invariants.
-  return success();
-}
-
 //===----------------------------------------------------------------------===//
 // MakeListOp
 //===----------------------------------------------------------------------===//
 
-static LogicalResult verify(MakeListOp op) {
-  auto listType = op.list().getType().cast<ListType>();
+LogicalResult MakeListOp::verify() {
+  auto listType = list().getType().cast<ListType>();
   switch (listType.getStorageClass()) {
     case CollectionStorageClass::Boxed:
-      for (auto element : op.elements()) {
+      for (auto element : elements()) {
         if (!element.getType().isa<ObjectType>()) {
-          return op.emitOpError() << "making a list with boxed storage class "
-                                     "must have object elements. Got: "
-                                  << element.getType();
+          return emitOpError() << "making a list with boxed storage class "
+                                  "must have object elements. Got: "
+                               << element.getType();
         }
       }
       break;
     case CollectionStorageClass::Unboxed:
-      for (auto element : op.elements()) {
+      for (auto element : elements()) {
         if (element.getType().isa<ObjectType>()) {
-          return op.emitOpError() << "making a list with unboxed storage class "
-                                     "must not have object elements. Got: "
-                                  << element.getType();
+          return emitOpError() << "making a list with unboxed storage class "
+                                  "must not have object elements. Got: "
+                               << element.getType();
         }
       }
       break;
     case CollectionStorageClass::Empty:
-      if (!op.elements().empty()) {
-        return op.emitOpError()
+      if (!elements().empty()) {
+        return emitOpError()
                << "making a list with empty storage class must have zero "
                   "elements";
       }
diff --git a/third_party/llvm-project b/third_party/llvm-project
index 8361c5d..e9c9ee9 160000
--- a/third_party/llvm-project
+++ b/third_party/llvm-project
@@ -1 +1 @@
-Subproject commit 8361c5da30588d3d4a48eae648f53be1feb5cfad
+Subproject commit e9c9ee9fe694067ee96643d05d6ac378349386bb
diff --git a/third_party/mlir-hlo b/third_party/mlir-hlo
index 7727bff..57288f1 160000
--- a/third_party/mlir-hlo
+++ b/third_party/mlir-hlo
@@ -1 +1 @@
-Subproject commit 7727bfff1a219c9cd60087a1ae0a4b7e52916f57
+Subproject commit 57288f12595a2ee0488806672a42da59b1e56e13