Merge pull request #5018 from ThomasRaoux:main-to-google

PiperOrigin-RevId: 361672157
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 99f70ed..9e29487 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,11 +13,6 @@
 # limitations under the License.
 
 cmake_minimum_required(VERSION 3.13.4)
-# Allow target_link_libraries() from other directories (since 3.13):
-#   https://cmake.org/cmake/help/v3.13/policy/CMP0079.html
-if(POLICY CMP0079)
-  cmake_policy(SET CMP0079 NEW)
-endif()
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 project(iree CXX C)
diff --git a/SUBMODULE_VERSIONS.txt b/SUBMODULE_VERSIONS.txt
index 301573f..ef5d094 100644
--- a/SUBMODULE_VERSIONS.txt
+++ b/SUBMODULE_VERSIONS.txt
@@ -6,7 +6,7 @@
 88b845dee001723c4a0db1fe5477de735b6d3bb0 third_party/liburing
 a31f503b4e0d22be4ee31b03f09a3f563e72f5e3 third_party/llvm-bazel
 c9ff39a3f9840c84453f23a37386a3dc374f055a third_party/llvm-project
-4e501d8c6e2d834999301a2492adefe5ddbdc0cb third_party/mlir-emitc
+b57346cdc50296c0c498aaf20b116b0ff23cb68c third_party/mlir-emitc
 55eda81407508a3391aa7d875515263dfe6044ee third_party/mlir-hlo
 2b2bd45bbf9be04fd22ece5cc1f54679202e9257 third_party/pffft
 d8c7ee00a687ac369e62e2032514a93a9b413502 third_party/pybind11
diff --git a/build_tools/bazel/build_core.sh b/build_tools/bazel/build_core.sh
index b7cd47c..4c7a54a 100755
--- a/build_tools/bazel/build_core.sh
+++ b/build_tools/bazel/build_core.sh
@@ -48,6 +48,9 @@
 declare -a default_build_tag_filters=("-nokokoro")
 declare -a default_test_tag_filters=("-nokokoro" "-driver=metal")
 
+# CUDA CI testing disabled until we setup a target for it.
+default_test_tag_filters+=("-driver=cuda")
+
 if [[ "${IREE_VULKAN_DISABLE?}" == 1 ]]; then
   default_test_tag_filters+=("-driver=vulkan")
 fi
diff --git a/build_tools/buildkite/cmake/android/arm64-v8a/benchmark.yml b/build_tools/buildkite/cmake/android/arm64-v8a/benchmark.yml
index d36bc46..ecf516a 100644
--- a/build_tools/buildkite/cmake/android/arm64-v8a/benchmark.yml
+++ b/build_tools/buildkite/cmake/android/arm64-v8a/benchmark.yml
@@ -28,7 +28,7 @@
 
   - wait
 
-  - label: "benchmark on snapdragon-855 (adreno-640) (Pixel 4)"
+  - label: "benchmark on Pixel 4 (snapdragon-855, adreno-640)"
     commands:
       - "buildkite-agent artifact download --step build model-artifacts.tgz ./"
       - "tar xzvf model-artifacts.tgz"
@@ -47,7 +47,7 @@
     artifact_paths: "mako-*.log"
     timeout_in_minutes: "15"
 
-  - label: "benchmark on exynos-990 (S20)"
+  - label: "benchmark on Galaxy S20 (exynos-990, mali-g77)"
     commands:
       - "buildkite-agent artifact download --step build model-artifacts.tgz ./"
       - "tar xzvf model-artifacts.tgz"
@@ -59,7 +59,7 @@
       - "adb shell rm -rf /data/local/tmp/benchmark_tmpdir"
     agents:
       - "android-soc=exynos-990"
-      - "android-version=10"
+      - "android-version=11"
       - "queue=benchmark-android"
     env:
       IREE_DOCKER_WORKDIR: "/usr/src/github/iree"
diff --git a/build_tools/buildkite/cmake/android/arm64-v8a/pipeline.yml b/build_tools/buildkite/cmake/android/arm64-v8a/pipeline.yml
index 8bd3864..0cacf62 100644
--- a/build_tools/buildkite/cmake/android/arm64-v8a/pipeline.yml
+++ b/build_tools/buildkite/cmake/android/arm64-v8a/pipeline.yml
@@ -25,8 +25,7 @@
 
   - wait
 
-  - label: "test on exynos-990 (mali-g77)"
-    skip: true
+  - label: "test on Galaxy S20 (exynos-990, mali-g77)"
     commands:
       - "buildkite-agent artifact download --step build build-artifacts.tgz ./"
       - "tar xzf build-artifacts.tgz"
@@ -34,13 +33,13 @@
       - "cd build-android/"
       - "ctest --output-on-failure"
     agents:
-      - "test-android=true"
       - "android-soc=exynos-990"
+      - "queue=test-android"
     env:
       IREE_DOCKER_WORKDIR: "/usr/src/github/iree"
     timeout_in_minutes: "15"
 
-  - label: "test on exynos-9820 (mali-g76)"
+  - label: "test on Galaxy S10 (exynos-9820, mali-g76)"
     commands:
       - "buildkite-agent artifact download --step build build-artifacts.tgz ./"
       - "tar xzf build-artifacts.tgz"
@@ -48,14 +47,13 @@
       - "cd build-android/"
       - "ctest --output-on-failure"
     agents:
-      - "test-android=true"
       - "android-soc=exynos-9820"
+      - "queue=test-android"
     env:
       IREE_DOCKER_WORKDIR: "/usr/src/github/iree"
     timeout_in_minutes: "15"
 
-  - label: "test on snapdragon-835 (adreno-540)"
-    skip: true
+  - label: "test on Pixel 4 (snapdragon-855, adreno-640)"
     commands:
       - "buildkite-agent artifact download --step build build-artifacts.tgz ./"
       - "tar xzf build-artifacts.tgz"
@@ -63,59 +61,8 @@
       - "cd build-android/"
       - "ctest --output-on-failure"
     agents:
-      - "test-android=true"
-      - "android-soc=snapdragon-835"
-    env:
-      IREE_DOCKER_WORKDIR: "/usr/src/github/iree"
-    timeout_in_minutes: "15"
-    soft_fail:
-      - exit_status: "*"
-
-  - label: "test on snapdragon-855 (adreno-640)"
-    # TODO(#4861): Re-enable when phone is fixed
-    skip: "Phone is borked. See https://github.com/google/iree/issues/4861"
-    commands:
-      - "buildkite-agent artifact download --step build build-artifacts.tgz ./"
-      - "tar xzf build-artifacts.tgz"
-      - "find build-android/ -name '*.cmake' -exec sed -i \"s!\\$IREE_DOCKER_WORKDIR/!\\$PWD/!g\" {} \\;"
-      - "cd build-android/"
-      - "ctest --output-on-failure"
-    agents:
-      - "test-android=true"
       - "android-soc=snapdragon-855"
-      - "android-version=10"
-    env:
-      IREE_DOCKER_WORKDIR: "/usr/src/github/iree"
-    timeout_in_minutes: "15"
-
-  - label: "test on snapdragon-855 (adreno-640) (Android 11)"
-    skip: true
-    commands:
-      - "buildkite-agent artifact download --step build build-artifacts.tgz ./"
-      - "tar xzf build-artifacts.tgz"
-      - "find build-android/ -name '*.cmake' -exec sed -i \"s!\\$IREE_DOCKER_WORKDIR/!\\$PWD/!g\" {} \\;"
-      - "cd build-android/"
-      - "ctest --output-on-failure"
-    agents:
-      - "test-android=true"
-      - "android-soc=snapdragon-855"
-      - "android-version=11"
-    env:
-      IREE_DOCKER_WORKDIR: "/usr/src/github/iree"
-    branches: "main"
-    timeout_in_minutes: "20"
-    soft_fail: true
-
-  - label: "test on snapdragon-865 (adreno-650)"
-    commands:
-      - "buildkite-agent artifact download --step build build-artifacts.tgz ./"
-      - "tar xzf build-artifacts.tgz"
-      - "find build-android/ -name '*.cmake' -exec sed -i \"s!\\$IREE_DOCKER_WORKDIR/!\\$PWD/!g\" {} \\;"
-      - "cd build-android/"
-      - "ctest --output-on-failure"
-    agents:
-      - "test-android=true"
-      - "android-soc=snapdragon-865"
+      - "queue=test-android"
     env:
       IREE_DOCKER_WORKDIR: "/usr/src/github/iree"
     timeout_in_minutes: "15"
diff --git a/build_tools/cmake/test.sh b/build_tools/cmake/test.sh
index b38df1c..dc3ab5c 100755
--- a/build_tools/cmake/test.sh
+++ b/build_tools/cmake/test.sh
@@ -28,6 +28,8 @@
 # and turning on the llvmaot ones.
 export IREE_VULKAN_DISABLE=${IREE_VULKAN_DISABLE:-0}
 export IREE_LLVMAOT_DISABLE=${IREE_LLVMAOT_DISABLE:-0}
+# CUDA is off by default.
+export IREE_CUDA_DISABLE=${IREE_CUDA_DISABLE:-1}
 
 # Tests to exclude by label. In addition to any custom labels (which are carried
 # over from Bazel tags), every test should be labeled with the directory it is
@@ -57,6 +59,9 @@
 if [[ "${IREE_LLVMAOT_DISABLE?}" == 1 ]]; then
   label_exclude_args+=("^driver=dylib$")
 fi
+if [[ "${IREE_CUDA_DISABLE?}" == 1 ]]; then
+  label_exclude_args+=("^driver=cuda$")
+fi
 
 # Join on "|"
 label_exclude_regex="($(IFS="|" ; echo "${label_exclude_args[*]?}"))"
diff --git a/docs/design_docs/cuda_backend.md b/docs/design_docs/cuda_backend.md
new file mode 100644
index 0000000..5b84c25
--- /dev/null
+++ b/docs/design_docs/cuda_backend.md
@@ -0,0 +1,102 @@
+# IREE CUDA backend
+
+This document is intended to provide an overview of the design choices made to support CUDA within IREE. It describes both the HAL runtime and the NVVM codegen side.
+
+## CUDA HAL Driver
+
+The CUDA HAL driver is in [`iree/hal/cuda/`][iree-cuda] directory. It is written in C following the standards of the rest of the HAL module.
+
+### CUDA library dependency
+
+IREE calls directly into [`CUDA driver API`][cuda-driver]. CUDA library is loaded dynamically and cuda.h header from CUDA SDK is part of IREE third_party project. Therefore IREE doesn't require CUDA SDK to be installed when building iree tools.
+At runtime HAL CUDA driver will load libcuda.so/nvcuda.dll library and load a subset of the cuda driver API used in HAL. The list of functions being used are in the file [`iree/hal/cuda/dynamic_symbols_tables.h`][cuda-symbols]
+
+### Driver
+
+There is no direct equivalent in CUDA to the HAL driver abstraction. We use it to hold the symbols loaded for all the devices.
+
+### Device
+
+The equivalent to HAL device in CUDA is the `CUcontext`, it holds all the state related to memory allocations.
+
+### Command buffer
+
+We implement command buffers using [`CUDA Graph API`][cuda-graph]. Using the Graph API makes it easy to encode fine-grained dependencies between dispatches without having to create multiple streams.
+Note that Graph API is meant to be used for command buffers that can be recorded once and used several times and there may be a performance penalty to using Graph API for direct command buffer. It is likely that we will also have a pure stream implementation in the future if we see performance problems with direct command buffer usages.
+
+### Event and Barrier
+
+In HAL Event and Barrier are used for GPU<->GPU synchronization either within a command buffer (Event and Barrier) or between command buffers.
+
+The current implementation ignores events and barriers and serializes all the nodes of the graph in order to have a conservative but correct solution.
+
+The design we plan for the future is to map dependencies within a command buffer to graph dependencies in the CUDA Graph API. When an event is signaled all the leaf nodes of the graph will be saved in HAL data structure and when the same command buffer waits on the signal we will add all the nodes as dependency to the future nodes added to the graph.
+
+For simplicity we always serialize command buffers sent to the same command queue.
+
+### Allocator
+
+The allocator will forward allocation requests to `cuMemHostAlloc` for host accessible memory and `cuMemAlloc` for device only memory.
+
+### Buffer
+
+CUDA buffers are represented either as a host pointer or a device pointer of type `CUdeviceptr`.
+
+### Executable
+
+HAL executable maps naturally to a PTX module. The compiler will generate a flat buffer containing a PTX text module as well as a list of entry point function names and the workgroup size associated with those entry points.
+
+### Semaphore
+
+Timeline semaphore is used in IREE to handle coarse grain synchronization for CPU<->GPU, GPU<->GPU and CPU<->CPU. The interface follows closely [`Vulkan timeline semaphore spec`][vulkan-semaphore].
+There is currently no simple way to implement this on CUDA. There are several solutions discussed on this [`IREE issue`][semaphore-issue] but no obvious solution. For now we force CPU and GPU to be synchronized after every submit to ensure correctness and ignore the semaphore.
+
+## NVVM Codegen
+
+### NVVM and PTX
+
+NVVM is a CUDA specific IR composed of LLVM IR and NVVM specific intrinsics. It can be compiled to PTX text using LLVM PTX backend. NVVM has an associated dialect in MLIR that translates 1:1 to NVVM intrinsics. This is what we are using to generate the PTX kernel code.
+
+### IREE flow
+
+IREE's [`target independent codegen`][codegen-passes] converts the compiler input to Linalg on Tensors. Afterward IREE will call the LinalgToNVVM codegen passes.
+Note that IREE had a legacy mode generating Linalg on Buffers. It is not supported by this path.
+
+Once we get into LinalgToNVVM passes we first do bufferize to generate Linalg on Buffers. Then we apply MLIR generic passes to convert linalg to SCF dialect and then SCF to Standard dialect. After that we convert Standard dialect to LLVM+NVVM dialect.
+
+## Example
+
+Save the following mlir in /tmp/add.mlir
+```mlir
+func @add(%lhs: tensor<4xf32>, %rhs: tensor<4xf32>) -> tensor<4xf32>
+ attributes { iree.module.export } {
+ %0 = "mhlo.add"(%lhs, %rhs) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
+ return %0 : tensor<4xf32>
+}
+```
+
+```shell
+# First translate into a VM bytecode module using linalg on tensors path.
+$ ../iree-build/iree/tools/iree-translate \
+ -iree-mlir-to-vm-bytecode-module \
+ --iree-hal-target-backends=cuda \
+ -iree-flow-dispatch-linalg-on-tensors \
+ /tmp/add.mlir \
+ -o /tmp/mhlo-add.vmfb
+
+# Run the module through CUDA HAL backend.
+$ ../iree-build/iree/tools/iree-run-module \
+-module_file=/tmp/mhlo-add.vmfb -driver=cuda -entry_function=add \
+--function_inputs='4xf32=[1 2 3 4], 4xf32=[2 2 2 2]'
+
+EXEC @add
+4xf32=3 4 5 6
+```
+
+[iree-cuda]: https://github.com/google/iree/tree/main/iree/hal/cuda
+[cuda-symbols]: https://github.com/google/iree/blob/main/iree/hal/cuda/dynamic_symbols_tables.h
+[cuda-driver]: https://docs.nvidia.com/cuda/cuda-driver-api/index.html
+[cuda-graph]: https://developer.nvidia.com/blog/cuda-graphs/
+[vulkan-semaphore]: https://www.khronos.org/blog/vulkan-timeline-semaphores
+[semaphore-issue]: https://github.com/google/iree/issues/4727
+[codegen-passes]: https://github.com/google/iree/blob/main/docs/design_docs/codegen_passes.md
diff --git a/docs/get_started/getting_started_linux_cmake.md b/docs/get_started/getting_started_linux_cmake.md
index c55774d..43f11e1 100644
--- a/docs/get_started/getting_started_linux_cmake.md
+++ b/docs/get_started/getting_started_linux_cmake.md
@@ -19,16 +19,16 @@
 
 ### Install CMake
 
-IREE uses CMake version `>= 3.13`. First try installing via your distribution's
-package manager and verify the version:
+IREE uses CMake version `>= 3.13.4`. First try installing via your
+distribution's package manager and verify the version:
 
 ```shell
 $ sudo apt install cmake
-$ cmake --version # >= 3.13
+$ cmake --version # >= 3.13.4
 ```
 
 Some package managers (like `apt`) distribute old versions of cmake. If your
-package manager installs a version `< 3.13`, then follow the installation
+package manager installs a version `< 3.13.4`, then follow the installation
 instructions [here](https://cmake.org/install/) to install a newer version (e.g.
 the latest).
 
diff --git a/docs/get_started/getting_started_macos_cmake.md b/docs/get_started/getting_started_macos_cmake.md
index d470281..e15745b 100644
--- a/docs/get_started/getting_started_macos_cmake.md
+++ b/docs/get_started/getting_started_macos_cmake.md
@@ -31,12 +31,12 @@
 
 ### Install CMake
 
-IREE uses [CMake](https://cmake.org/) version `>= 3.13`. Brew ships the latest
+IREE uses [CMake](https://cmake.org/) version `>= 3.13.4`. Brew ships the latest
 release.
 
 ```shell
 $ brew install cmake
-$ cmake --version  # >= 3.13
+$ cmake --version  # >= 3.13.4
 ```
 
 ### Install Ninja
diff --git a/docs/get_started/getting_started_windows_cmake.md b/docs/get_started/getting_started_windows_cmake.md
index 04a48e0..bb88488 100644
--- a/docs/get_started/getting_started_windows_cmake.md
+++ b/docs/get_started/getting_started_windows_cmake.md
@@ -19,7 +19,7 @@
 
 ### Install CMake
 
-Install CMake version >= 3.13 from the
+Install CMake version >= 3.13.4 from the
 [downloads page](https://cmake.org/download/).
 
 > Tip:<br>
diff --git a/experimental/bindings/java/build.gradle b/experimental/bindings/java/build.gradle
index ec926d7..afd4e32 100644
--- a/experimental/bindings/java/build.gradle
+++ b/experimental/bindings/java/build.gradle
@@ -54,7 +54,7 @@
 
     externalNativeBuild {
         cmake {
-            version "3.13.0+"
+            version "3.13.4.0+"
             path "../../../CMakeLists.txt"
         }
     }
diff --git a/experimental/runners/CMakeLists.txt b/experimental/runners/CMakeLists.txt
new file mode 100644
index 0000000..34e4344
--- /dev/null
+++ b/experimental/runners/CMakeLists.txt
@@ -0,0 +1,32 @@
+cmake_minimum_required(VERSION 3.18)
+project(linalg-opt CXX)
+
+find_package(MLIR REQUIRED CONFIG)
+list(APPEND CMAKE_MODULE_PATH ${MLIR_DIR})
+list(APPEND CMAKE_MODULE_PATH ${MLIR_DIR}/../llvm)
+include(TableGen)
+include(AddLLVM)
+include(AddMLIR)
+
+include_directories(${LLVM_INCLUDE_DIRS})
+include_directories(${MLIR_INCLUDE_DIRS})
+link_directories(${LLVM_BUILD_LIBRARY_DIR})
+add_definitions(${LLVM_DEFINITIONS})
+
+get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
+get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
+
+set(LIBS
+  ${dialect_libs}
+  ${conversion_libs}
+  MLIROptLib
+)
+
+add_llvm_executable(linalg-opt
+  mlir-opt.cpp
+  LinalgTileToGeneric.cpp
+  LinalgComprehensiveBufferizePass.cpp
+  LinalgTensorCodegenStrategy.cpp
+)
+target_link_libraries(linalg-opt PRIVATE ${LIBS})
+mlir_check_all_link_libraries(linalg-opt)
diff --git a/experimental/runners/LinalgComprehensiveBufferizePass.cpp b/experimental/runners/LinalgComprehensiveBufferizePass.cpp
new file mode 100644
index 0000000..100c7b9
--- /dev/null
+++ b/experimental/runners/LinalgComprehensiveBufferizePass.cpp
@@ -0,0 +1,1378 @@
+//===- LinalgSimpleBufferizePass.cpp - Bufferize Linalg on tensors --------===//
+//
+// Convert from Linalg ops on tensors to Linalg ops on buffers in a single pass.
+// This will aggressively try to perform inplace bufferization and will fail if
+// any allocation tries to cross function boundaries or if the pattern
+// tensor_load(tensor_memref(x)) is deemed unsafe (very conservative impl for
+// now).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
+#include "mlir/Dialect/Linalg/Passes.h"
+#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
+#include "mlir/Dialect/SCF/Passes.h"
+#include "mlir/Dialect/SCF/SCF.h"
+#include "mlir/Dialect/Shape/Transforms/Passes.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/Dialect/StandardOps/Transforms/Passes.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Tensor/Transforms/Passes.h"
+#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/IR/BlockAndValueMapping.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Dominance.h"
+#include "mlir/IR/Location.h"
+#include "mlir/IR/Matchers.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/OperationSupport.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Interfaces/CallInterfaces.h"
+#include "mlir/Interfaces/ControlFlowInterfaces.h"
+#include "mlir/Interfaces/LoopLikeInterface.h"
+#include "mlir/Interfaces/ViewLikeInterface.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Transforms/Passes.h"
+
+#define DEBUG_TYPE "linalg-comprehensive-bufferize-inplace"
+
+using namespace mlir;
+using namespace linalg;
+
+#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
+
+namespace {
+struct LinalgComprehensiveBufferizePass
+    : public PassWrapper<LinalgComprehensiveBufferizePass,
+                         OperationPass<ModuleOp>> {
+  LinalgComprehensiveBufferizePass()
+      : enablingPassPipeline(OpPassManager("func")) {
+    enablingPassPipeline.addPass(createCanonicalizerPass());
+    enablingPassPipeline.addPass(createCSEPass());
+    enablingPassPipeline.addPass(createLoopInvariantCodeMotionPass());
+  }
+  LinalgComprehensiveBufferizePass(const LinalgComprehensiveBufferizePass &pass)
+      : enablingPassPipeline(pass.enablingPassPipeline) {}
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<LinalgDialect, scf::SCFDialect, StandardOpsDialect>();
+  }
+
+  void runOnOperation() override;
+
+  void runEnablingTransforms(FuncOp funcOp);
+  void bufferizeFuncOpInternals(FuncOp funcOp);
+
+  Option<bool> disableInPlace{
+      *this, "disable-inplace",
+      llvm::cl::desc(
+          "Disables inplace buferization. This is for testing purposes."),
+      llvm::cl::init(false)};
+
+  /// Dynamic pass pipeline of transformations that enable better inplace
+  /// bufferization.
+  OpPassManager enablingPassPipeline;
+};
+}  // namespace
+
+//===----------------------------------------------------------------------===//
+// Bufferization-specific attribute manipulation.
+//===----------------------------------------------------------------------===//
+
+/// Attribute marker to specify operands that can be bufferized inplace.
+constexpr StringLiteral kInPlaceAttrName = "__inplace_attr__";
+/// Attribute marker to specify results that fold onto input arguments.
+constexpr StringLiteral kResultFoldArgAttrName = "__result_fold_arg_attr__";
+
+// default clause
+enum class InPlaceSpec {
+  False,
+  True,
+  None,
+};
+
+static StringRef stringify(InPlaceSpec val) {
+  switch (val) {
+    case InPlaceSpec::False:
+      return "false";
+    case InPlaceSpec::True:
+      return "true";
+    case InPlaceSpec::None:
+      return "none";
+  }
+  return "";
+}
+
+static Optional<InPlaceSpec> symbolize(StringRef str) {
+  return StringSwitch<Optional<InPlaceSpec>>(str)
+      .Case("false", InPlaceSpec::False)
+      .Case("true", InPlaceSpec::True)
+      .Case("none", InPlaceSpec::None)
+      .Default(None);
+}
+
+/// Set the attribute entry `kInPlaceAttrName`@`idx` to `inplace`.
+/// If the attribute does not exist yet, add a blanket array attribute filled
+/// with InPlaceSpec::None before setting `kInPlaceAttrName`@`idx` to `inplace`.
+static void setInplace(Operation *op, unsigned idx = 0,
+                       InPlaceSpec inplace = InPlaceSpec::True) {
+  auto attr = op->getAttr(kInPlaceAttrName);
+  assert(!attr || attr.isa<ArrayAttr>());
+  SmallVector<StringRef> pos;
+  if (!attr) {
+    auto funcOp = dyn_cast<FuncOp>(op);
+    pos = funcOp ? SmallVector<StringRef>(funcOp.getNumArguments(),
+                                          stringify(InPlaceSpec::None))
+                 : SmallVector<StringRef>(op->getNumOperands(),
+                                          stringify(InPlaceSpec::None));
+  } else {
+    pos = llvm::to_vector<4>(
+        attr.cast<ArrayAttr>().getAsValueRange<StringAttr>());
+  }
+  LLVM_DEBUG(DBGS() << "Set inplace=" << stringify(inplace) << ": " << *op
+                    << " @idx=" << idx << "\n");
+  pos[idx] = stringify(inplace);
+  op->setAttr(kInPlaceAttrName, OpBuilder(op).getStrArrayAttr(pos));
+}
+
+static InPlaceSpec getInplace(Operation *op, unsigned operandIndex = 0) {
+  auto attr = op->getAttr(kInPlaceAttrName).dyn_cast_or_null<ArrayAttr>();
+  if (!attr) return InPlaceSpec::None;
+  assert(attr.size() > operandIndex);
+  // Must return a proper value.
+  return *symbolize(
+      *(attr.getAsValueRange<StringAttr>().begin() + operandIndex));
+}
+
+static Optional<int64_t> getResultFoldArgIndex(FuncOp op, unsigned resultIdx) {
+  auto attr = op->getAttr(kResultFoldArgAttrName).dyn_cast_or_null<ArrayAttr>();
+  if (!attr) return llvm::None;
+  APInt val = *(attr.getAsValueRange<IntegerAttr>().begin() + resultIdx);
+  int64_t res = val.getSExtValue();
+  if (res < 0) return llvm::None;
+  return res;
+}
+
+//===----------------------------------------------------------------------===//
+// Bufferization-specific MemRefType support.
+//===----------------------------------------------------------------------===//
+
+/// Return the contiguous MemRefType (i.e. with canonical/empty layout map) to
+/// which `type` can be bufferized to, assuming `type` is a RankedTensorType.
+static MemRefType getContiguousMemRefType(Type type,
+                                          ArrayRef<AffineMap> layout = {},
+                                          unsigned addressSpace = 0) {
+  RankedTensorType tensorType = type.cast<RankedTensorType>();
+  return MemRefType::get(tensorType.getShape(), tensorType.getElementType(),
+                         layout, addressSpace);
+}
+
+/// Return a MemRefType to which the `tensorType` can be bufferized in a
+/// composable fashion. The layout must be the most dynamic possible and
+/// canonicalize away once bufferization is finished.
+static MemRefType getDynamicMemRefType(RankedTensorType tensorType,
+                                       unsigned addressSpace = 0) {
+  // TODO: address space decisions to connect with the actual alloc.
+  int64_t dynamicOffset = ShapedType::kDynamicStrideOrOffset;
+  SmallVector<int64_t> dynamicStrides(tensorType.getRank(),
+                                      ShapedType::kDynamicStrideOrOffset);
+  AffineMap stridedLayout = makeStridedLinearLayoutMap(
+      dynamicStrides, dynamicOffset, tensorType.getContext());
+  return MemRefType::get(tensorType.getShape(), tensorType.getElementType(),
+                         stridedLayout, addressSpace);
+}
+
+// Transfer all `dim` ops on `tensor` to `memref`.
+static void transferDimOpsToMemref(Value tensor, Value memref) {
+  for (OpOperand &opOperand : llvm::make_early_inc_range(tensor.getUses())) {
+    if (isa<DimOp>(opOperand.getOwner())) {
+      opOperand.set(memref);
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Bufferization-specific BlockAndValueMapping support with debugging.
+//===----------------------------------------------------------------------===//
+
+/// Wrapper for better debugging.
+static void map(BlockAndValueMapping &bvm, ValueRange key, ValueRange value) {
+  if (key.empty()) return;
+  LLVM_DEBUG(DBGS() << "Map: " << key.front() << " to " << value.front()
+                    << "\n");
+  return bvm.map(key, value);
+}
+
+/// Wrapper for better debugging.
+static void map(BlockAndValueMapping &bvm, Value key, Value value) {
+  LLVM_DEBUG(DBGS() << "Map: " << key << " to " << value << "\n");
+  return bvm.map(key, value);
+}
+
+/// Wrapper for better debugging.
+static Value lookup(BlockAndValueMapping &bvm, Value key) {
+  if (!bvm.lookupOrNull(key)) {
+    MemRefType memRefType =
+        getDynamicMemRefType(key.getType().cast<RankedTensorType>());
+    Operation *op = key.getDefiningOp() ? key.getDefiningOp()
+                                        : key.getParentBlock()->getParentOp();
+    OpBuilder b(op->getContext());
+    // No InsertionGuard needed here.
+    if (auto blockArg = key.dyn_cast<BlockArgument>())
+      b.setInsertionPointToStart(blockArg.getParentBlock());
+    else
+      b.setInsertionPointAfter(op);
+    map(bvm, key, b.create<TensorToMemrefOp>(op->getLoc(), memRefType, key));
+  }
+  return bvm.lookup(key);
+}
+
+//===----------------------------------------------------------------------===//
+// Bufferization-specific inplace pattern matching support.
+//===----------------------------------------------------------------------===//
+
+/// First assign `op` if `slice.back()` isa `T`, then check condition.
+/// If anything fails just return failure. Otherwise update `sliceRef` by
+/// dropping `sliceRef.back()`, then return success().
+template <typename T>
+static LogicalResult matchAndDropBack(
+    ArrayRef<Operation *> &sliceRef, T &op,
+    llvm::function_ref<LogicalResult(T)> condition = nullptr) {
+  if (sliceRef.empty()) return failure();
+  op = dyn_cast<T>(sliceRef.back());
+  if (!op || (condition && failed(condition(op)))) return failure();
+  sliceRef = sliceRef.drop_back();
+  return success();
+}
+
+/// First assign `op1`/`op2` if `slice.front()`/`slice.back()` isa `T1`/`T2`,
+/// respectively. Then check condition. If anything fails just return failure.
+/// Otherwise update `sliceRef` by dropping `sliceRef.front()` and
+/// `sliceRef.back()`, then return success().
+template <typename T1, typename T2>
+static LogicalResult matchAndDropEnclosingPair(
+    ArrayRef<Operation *> &sliceRef, T1 &op1, T2 &op2,
+    llvm::function_ref<LogicalResult(T1, T2)> condition = nullptr) {
+  if (sliceRef.size() < 2) return failure();
+  op1 = dyn_cast<T1>(sliceRef.front());
+  op2 = dyn_cast<T2>(sliceRef.back());
+  if (!op1 || !op2 || (condition && failed(condition(op1, op2))))
+    return failure();
+  sliceRef = sliceRef.drop_front().drop_back();
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// Bufferization-specific scoped alloc/dealloc insertion support.
+//===----------------------------------------------------------------------===//
+
+// TODO: need to hoist this across function boundaries. Maybe by using
+// init_tensor + subtensor_insert.
+static Value createNewAllocDeallocPairForShapedValue(
+    OpBuilder &b, Location loc, Value shapedValue,
+    SmallVector<Value, 4> dynOperands = {}) {
+  MemRefType memRefType = shapedValue.getType().dyn_cast<MemRefType>();
+  assert(memRefType || shapedValue.getType().dyn_cast<RankedTensorType>());
+  // TODO: non-zero address space.
+  // TODO: layout information if relevant.
+  if (!memRefType) memRefType = getContiguousMemRefType(shapedValue.getType());
+
+  OpBuilder::InsertionGuard g(b);
+  if (auto bbArg = shapedValue.dyn_cast<BlockArgument>()) {
+    b.setInsertionPointToStart(bbArg.getOwner());
+    loc = bbArg.getOwner()->getParentOp()->getLoc();
+  } else {
+    b.setInsertionPointAfter(shapedValue.getDefiningOp());
+    loc = shapedValue.getDefiningOp()->getLoc();
+  }
+
+  // If the dynOperands are not passed explicitly, compute them.
+  // This circumvents currently missing dim(init_tensor) canonicalizations.
+  if (dynOperands.empty()) {
+    for (auto dim : llvm::enumerate(memRefType.getShape()))
+      if (dim.value() == ShapedType::kDynamicSize)
+        dynOperands.push_back(b.create<DimOp>(loc, shapedValue, dim.index()));
+  }
+  Value allocated = b.create<AllocOp>(loc, memRefType, dynOperands);
+  b.setInsertionPoint(allocated.getParentBlock()->getTerminator());
+  b.create<DeallocOp>(loc, allocated);
+  return allocated;
+}
+
+//===----------------------------------------------------------------------===//
+// Bufferization-specific inplace analysis support.
+//===----------------------------------------------------------------------===//
+
+/// Walk back the chain of known ops all the way to function arguments:
+///   - if an AllocOp, AllocaOp or InitTensorOp is met, return true.
+///   - if a LinalgOp is met, return true: either it is already known to trace
+///     back to a function arg that is writeable or it is already guaranteed to
+///     create an AllocOp into which we can write.
+///   - if the function argument is marked inplace, return true.
+///   - if the function argument is not marked inplace, return false.
+///   - if an unknown op is encountered, abort for now.
+static bool livesInWritableMemoryLocation(Value v) {
+  LLVM_DEBUG(DBGS() << "Start livesInWritableMemoryLocation @" << v << "\n");
+  // `done` terminates the walk; `res` holds the answer once `done` is set.
+  bool done = false, res = false;
+  while (!done) {
+    // Scalar or vector value comes from a load, just return true.
+    if (!v.getType()
+             .isa<MemRefType, RankedTensorType, UnrankedMemRefType,
+                  UnrankedTensorType>())
+      return true;
+    // Block argument: hop to the corresponding value in the parent op, or
+    // terminate when a function argument is reached.
+    if (auto bbArg = v.dyn_cast<BlockArgument>()) {
+      llvm::TypeSwitch<Operation *, void>(bbArg.getOwner()->getParentOp())
+          .Case([&](scf::ForOp forOp) {
+            // Continue from the matching iter operand; bbArg #0 is the
+            // induction variable, hence the -1 adjustment.
+            v = forOp.getIterOperands()[bbArg.getArgNumber() - /*iv=*/1];
+          })
+          .Case([&](FuncOp funcOp) {
+            assert(bbArg.getType().isa<TensorType>() &&
+                   "already bufferized func");
+            // Function argument reached: writable iff marked inplace.
+            if (getInplace(funcOp, bbArg.getArgNumber()) != InPlaceSpec::True)
+              res = false;
+            else
+              res = true;
+            done = true;
+          })
+          .Default([&](Operation *op) {
+            llvm::errs() << "In function:\n" << *op->getParentOfType<FuncOp>();
+            llvm::errs() << "\nUnsupported livesInWritableMemoryLocation "
+                         << *op << "\nstarting from value: " << v;
+            abort();
+          });
+      continue;
+    }
+    // Op result: hop to the producing op's relevant operand, or terminate at
+    // a fresh allocation.
+    auto opResult = v.cast<OpResult>();
+    llvm::TypeSwitch<Operation *, void>(opResult.getOwner())
+        .Case([&](LinalgOp linalgOp) {
+          // TODO: uses implicit knowledge that output tensor matches result
+          // 1-1.
+          v = linalgOp.getOutputTensors()[opResult.getResultNumber()];
+        })
+        .Case<TensorToMemrefOp, TensorLoadOp, tensor::CastOp>(
+            [&](Operation *op) { v = op->getOperand(0); })
+        .Case<linalg::InitTensorOp, AllocOp, AllocaOp>([&](Operation *op) {
+          // Fresh allocations are always writable.
+          res = true;
+          done = true;
+        })
+        .Default([&](Operation *op) {
+          llvm::errs() << "In function:\n" << *op->getParentOfType<FuncOp>();
+          llvm::errs() << "\nUnsupported livesInWritableMemoryLocation " << *op
+                       << "\nstarting from value: " << v;
+          abort();
+        });
+  }
+  return res;
+}
+
+namespace {
+// Represent an inplace action that is to be committed as an Operation attribute
+// upon successful detection of a chain of ops that can be run inplace.
+struct InPlaceAction {
+  // The op to be marked inplace.
+  Operation *op;
+  // Output indices on `op` to mark inplace; when empty, the whole op is
+  // marked inplace.
+  SmallVector<unsigned> outputIndices;
+};
+}  // namespace
+
+/// Find simple forms of destructive update which writes over a yielded tensor
+/// without ever reading from it. For now, we only allow:
+/// ```
+///    vector.transfer_write -> subtensor_insert -> yield
+/// ```
+static void iterativeOverwritesAnalysis(Operation *parentOp,
+                                        ArrayRef<BlockArgument> candidates) {
+  // Only scf.for iter args and function arguments are analyzed.
+  if (!isa<scf::ForOp, FuncOp>(parentOp)) return;
+
+  for (auto en : llvm::enumerate(candidates)) {
+    Value candidate = en.value();
+    if (!candidate.getType().isa<ShapedType>()) continue;
+
+    LLVM_DEBUG(llvm::dbgs() << "\n\n");
+    LLVM_DEBUG(DBGS() << "Iterative overwrite analysis on candidate: "
+                      << candidate << "\nof:\n"
+                      << *parentOp << "\n");
+    // Only worth analyzing if the candidate can be written to at all.
+    if (!livesInWritableMemoryLocation(candidate)) continue;
+
+    // Collect the ops transitively using `candidate`, restricted to the
+    // immediate region of `parentOp`.
+    llvm::SetVector<Operation *> slice;
+    getForwardSlice(candidate, &slice, [&](Operation *op) {
+      // Skip any extra nesting between parentOp and op.
+      return op == parentOp || op->getBlock()->getParentOp() == parentOp;
+    });
+
+    LLVM_DEBUG(DBGS() << "Iterative overwrite TRY:\n");
+    LLVM_DEBUG(llvm::for_each(
+        slice, [](Operation *op) { DBGS() << "Slice op: " << *op << "\n"; }));
+
+    // bbArg must be used exactly by one subtensor_insert + yield.
+    if (!candidate.hasOneUse()) {
+      LLVM_DEBUG(DBGS() << "bbArg does not have exactly 1 use."
+                           "\nIterative overwrite FAIL\n");
+      continue;
+    }
+    // The slice must be exactly {subtensor_insert, yield}.
+    if (slice.size() != 2) {
+      LLVM_DEBUG(DBGS() << "Need exactly 2 ops in slice. "
+                           "\nIterative overwrite FAIL\n");
+      continue;
+    }
+
+    auto sliceRef = slice.getArrayRef();
+    // Match yieldOp and update sliceRef.
+    scf::YieldOp yieldOp;
+    if (failed(matchAndDropBack(sliceRef, yieldOp))) continue;
+
+    // Match subTensorInsertOp and update sliceRef.
+    SubTensorInsertOp subTensorInsertOp;
+    if (failed(matchAndDropBack(sliceRef, subTensorInsertOp))) continue;
+
+    // Optional vector::TransferWriteOp.
+    auto vectorTransferWriteOp =
+        subTensorInsertOp.source().getDefiningOp<vector::TransferWriteOp>();
+
+    // subtensor_insert must be used exactly by the yield at index `idx`.
+    unsigned idx = en.index();
+    if (!subTensorInsertOp.result().hasOneUse() ||
+        !isa<scf::YieldOp>(*subTensorInsertOp.result().getUsers().begin()) ||
+        subTensorInsertOp.result().getUses().begin()->getOperandNumber() !=
+            idx) {
+      LLVM_DEBUG(DBGS() << "SubTensorInsertOp does not have a single YieldOp "
+                           "use. \nIterative overwrite chain FAIL\n");
+      continue;
+    }
+
+    // Commit: mark the whole detected chain inplace.
+    setInplace(parentOp, en.index());
+    if (vectorTransferWriteOp) setInplace(vectorTransferWriteOp);
+    setInplace(subTensorInsertOp);
+    setInplace(yieldOp, en.index());
+    LLVM_DEBUG(DBGS() << "Iterative overwrite chain SUCCESS\n");
+  }
+}
+
+/// Return success if all offsets, sizes and strides of `op1` and `op2` are
+/// pairwise equal, failure otherwise.
+static LogicalResult sameOffsetsSizesAndStrides(
+    OffsetSizeAndStrideOpInterface op1, OffsetSizeAndStrideOpInterface op2) {
+  // Quick reject on mismatched ranks.
+  if (op1.static_offsets().size() != op2.static_offsets().size() ||
+      op1.static_sizes().size() != op2.static_sizes().size() ||
+      op1.static_strides().size() != op2.static_strides().size())
+    return failure();
+  // Element-wise comparison of the mixed (static-or-dynamic) values.
+  auto allEqual = [](ArrayRef<OpFoldResult> lhs, ArrayRef<OpFoldResult> rhs) {
+    for (auto pair : llvm::zip(lhs, rhs))
+      if (!isEqualConstantIntOrValue(std::get<0>(pair), std::get<1>(pair)))
+        return false;
+    return true;
+  };
+  if (!allEqual(op1.getMixedOffsets(), op2.getMixedOffsets()) ||
+      !allEqual(op1.getMixedSizes(), op2.getMixedSizes()) ||
+      !allEqual(op1.getMixedStrides(), op2.getMixedStrides()))
+    return failure();
+  return success();
+}
+
+/// Check that `read` and `write` form a matching pair of vector transfers on
+/// `subtensor`: either both are absent, or both are present, operate directly
+/// on `subtensor`, and transfer the full shape (i.e. this is a
+/// vectorization).
+static LogicalResult matchingVectorTransfersAtSource(
+    vector::TransferReadOp read, vector::TransferWriteOp write,
+    Value subtensor) {
+  // Either we have a pair of matching transfer read/write or none.
+  if (read && !write) {
+    LLVM_DEBUG(DBGS() << "Slice has transferReadOp but no transferWriteOp"
+                         "\nDestructive update chain FAIL\n");
+    return failure();
+  }
+  if (!read && write) {
+    LLVM_DEBUG(DBGS() << "Slice has transferWriteOp but no transferReadOp"
+                         "\nDestructive update chain FAIL\n");
+    return failure();
+  }
+  if (read && write) {
+    // If we have a pair of matching read/write, the tensor and vector shape
+    // must exactly match (i.e. this is a vectorization).
+    if (read.source() != subtensor) {
+      LLVM_DEBUG(DBGS() << "transferReadOp.source() != subTensor.result()"
+                           "\nDestructive update chain FAIL\n");
+      return failure();
+    }
+    if (write.source() != subtensor) {
+      LLVM_DEBUG(DBGS() << "transferWriteOp.source() != subTensor.result()"
+                           "\nDestructive update chain FAIL\n");
+      return failure();
+    }
+    if (read.getShapedType().getShape() != read.getVectorType().getShape()) {
+      LLVM_DEBUG(DBGS() << "transferReadOp source and result shapes mismatch"
+                           "\nDestructive update chain FAIL\n");
+      return failure();
+    }
+    if (write.getShapedType().getShape() != write.getVectorType().getShape()) {
+      LLVM_DEBUG(DBGS() << "transferWriteOp source and result shapes mismatch"
+                           "\nDestructive update chain FAIL\n");
+      return failure();
+    }
+  }
+  return success();
+}
+
+/// FuncOp overload: at function level the only requirement is that the
+/// forward slice of `candidate` ends with a ReturnOp terminator, which is
+/// matched and dropped from `sliceRef` (sliceRef is automatically updated).
+/// `candidate` and `inPlaceActions` are unused here; they keep the signature
+/// parallel with the scf::ForOp overload.
+static LogicalResult detectDestructiveUpdatePattern(
+    FuncOp parentOp, BlockArgument candidate, ArrayRef<Operation *> &sliceRef,
+    SmallVector<InPlaceAction> &inPlaceActions) {
+  // Guard for callers that pass a null dyn_cast result.
+  if (!parentOp) return failure();
+
+  ReturnOp terminator;
+  // Match returnOp and update sliceRef.
+  if (failed(matchAndDropBack(sliceRef, terminator))) {
+    LLVM_DEBUG(DBGS() << "destructive update slice must end with a known "
+                         "terminator.\nDestructive update chain FAIL\n");
+    return failure();
+  }
+  return success();
+}
+
+/// In the case of an scf::ForOp, we look for:
+///   `candidate -> subtensor -> vector.transfer_read(*) -> ...
+///      vector.transfer_write(*) -> subtensor_insert -> yield`.
+/// sliceRef is automatically updated to match `...`.
+///
+/// (*) represents an optional op in the chain, if a subtensor or
+/// vector.transfer is included, the matching op must be included too.
+static LogicalResult detectDestructiveUpdatePattern(
+    scf::ForOp parentOp, BlockArgument candidate,
+    ArrayRef<Operation *> &sliceRef,
+    SmallVector<InPlaceAction> &inPlaceActions) {
+  // Guard for callers that pass a null dyn_cast result.
+  if (!parentOp) return failure();
+
+  scf::YieldOp terminator;
+  SubTensorOp subTensorOp;
+  SubTensorInsertOp subTensorInsertOp;
+  vector::TransferReadOp vectorTransferReadOp;
+  vector::TransferWriteOp vectorTransferWriteOp;
+
+  // bbArg must be used exactly by one subtensor / subtensor_insert pair.
+  if (candidate.use_empty() || candidate.hasOneUse() ||
+      std::next(candidate.getUsers().begin(), 2) !=
+          candidate.getUsers().end()) {
+    LLVM_DEBUG(DBGS() << "bbArg does not have exactly 2 uses."
+                         "\nDestructive update chain FAIL\n");
+    return failure();
+  }
+  // Minimum chain: subtensor + subtensor_insert + yield.
+  if (sliceRef.size() < 3) {
+    LLVM_DEBUG(DBGS() << "scf::ForOp destructive updated must have >= 3 ops."
+                         "\nDestructive update chain FAIL\n");
+    return failure();
+  }
+
+  // Match yieldOp and update sliceRef.
+  if (failed(matchAndDropBack(sliceRef, terminator))) {
+    LLVM_DEBUG(DBGS() << "destructive update slice must end with a known "
+                         "terminator.\nDestructive update chain FAIL\n");
+    return failure();
+  }
+
+  // Match subtensor pair and update sliceRef.
+  // subtensor / subtensor_insert must match.
+  auto matchSubTensors = [](SubTensorOp st, SubTensorInsertOp sti) {
+    auto res = sameOffsetsSizesAndStrides(st, sti);
+    if (failed(res))
+      LLVM_DEBUG(DBGS() << "subtensor ops don't match: " << st << " and " << sti
+                        << "\nDestructive update chain FAIL\n");
+    return res;
+  };
+  if (failed(matchAndDropEnclosingPair<SubTensorOp, SubTensorInsertOp>(
+          sliceRef, subTensorOp, subTensorInsertOp, matchSubTensors)))
+    return failure();
+
+  // subtensor_insert must be used exactly by the terminator at index `idx`.
+  unsigned idx = candidate.getArgNumber() - /*#iv=*/1;  // adjust for ForOp iv.
+  if (!subTensorInsertOp.result().hasOneUse() ||
+      terminator != *subTensorInsertOp.result().getUsers().begin() ||
+      terminator->getOperand(idx) != subTensorInsertOp.result()) {
+    LLVM_DEBUG(
+        DBGS() << "SubTensorInsertOp does not have a single terminator use "
+                  "at the right index.\nDestructive update chain FAIL\n");
+    return failure();
+  }
+
+  // Maybe match vector transfer pair and update sliceRef.
+  // If we find one, the other must be present and match too.
+  auto matchTransfers = [&](vector::TransferReadOp read,
+                            vector::TransferWriteOp write) {
+    return matchingVectorTransfersAtSource(read, write, subTensorOp.result());
+  };
+  if (failed(matchAndDropEnclosingPair<vector::TransferReadOp,
+                                       vector::TransferWriteOp>(
+          sliceRef, vectorTransferReadOp, vectorTransferWriteOp,
+          matchTransfers)) &&
+      (vectorTransferReadOp || vectorTransferWriteOp))
+    return failure();
+
+  // Commit what has been detected.
+  inPlaceActions.push_back(InPlaceAction{subTensorOp});
+  if (vectorTransferReadOp)
+    inPlaceActions.push_back(InPlaceAction{vectorTransferReadOp});
+  if (vectorTransferWriteOp)
+    inPlaceActions.push_back(InPlaceAction{vectorTransferWriteOp});
+  inPlaceActions.push_back(InPlaceAction{subTensorInsertOp});
+  inPlaceActions.push_back(InPlaceAction{terminator, {idx}});
+
+  return success();
+}
+
+/// Iterate over bbArgs of `parentOp` and determine if they are the root of a
+/// destructive update chain such as:
+/// ```
+///    scf.for bbArg -> subtensor -> DAG of admissible inPlaceActions
+///      -> subtensor_insert -> yield.
+/// ```
+/// Such a representation is related to traditional loop nest + memory analysis
+/// but provides a simpler abstraction.
+/// In traditional memory-based dependence analysis, one would need to analyze
+/// all possible interleavings of possibly aliasing loads and stores in the
+/// context of the k-common surrounding loops.
+/// With scf.for + subtensor + subtensor_insert + yield, more ordering semantics
+/// are available as well as dealiasing thanks to SSA use-def chains.
+static void destructiveUpdateAnalysis(Operation *parentOp,
+                                      ArrayRef<BlockArgument> candidates) {
+  for (auto en : llvm::enumerate(candidates)) {
+    BlockArgument candidate = en.value();
+    // Only shaped (tensor/memref) candidates are relevant.
+    if (!candidate.getType().isa<ShapedType>()) continue;
+
+    LLVM_DEBUG(llvm::dbgs() << "\n\n");
+    LLVM_DEBUG(DBGS() << "Destructive update analysis on candidate: "
+                      << candidate << "\nof:\n"
+                      << *parentOp << "\n");
+    if (!livesInWritableMemoryLocation(candidate)) continue;
+
+    // Forward slice of the candidate restricted to the immediate region of
+    // `parentOp`.
+    llvm::SetVector<Operation *> slice;
+    getForwardSlice(candidate, &slice, [&](Operation *op) {
+      // Skip any extra nesting between parentOp and op.
+      return op == parentOp || op->getBlock()->getParentOp() == parentOp;
+    });
+
+    LLVM_DEBUG(DBGS() << "Slice:\n");
+    for (auto *op : slice) LLVM_DEBUG(DBGS() << *op << "\n");
+
+    SmallVector<InPlaceAction> inPlaceActions;
+    inPlaceActions.reserve(slice.size());
+    ArrayRef<Operation *> sliceRef = slice.getArrayRef();
+    // Try the scf::ForOp pattern first, then the FuncOp pattern; the null
+    // dyn_cast guards inside the overloads reject the wrong parent kind.
+    if (failed(detectDestructiveUpdatePattern(dyn_cast<scf::ForOp>(parentOp),
+                                              candidate, sliceRef,
+                                              inPlaceActions)) &&
+        failed(detectDestructiveUpdatePattern(
+            dyn_cast<FuncOp>(parentOp), candidate, sliceRef, inPlaceActions))) {
+      LLVM_DEBUG(DBGS() << "Failed to detect: Destructive update chain FAIL\n");
+      continue;
+    }
+
+    // Add the current op and add pattern eagerly to simplify implementation.
+    inPlaceActions.push_back(
+        {parentOp, {static_cast<unsigned int>(en.index())}});
+    for (auto &action : inPlaceActions) {
+      // Empty index list means the whole op is marked inplace.
+      if (action.outputIndices.empty()) setInplace(action.op);
+      for (unsigned idx : action.outputIndices) setInplace(action.op, idx);
+    }
+  }
+
+  // Independently of the bbArg chains: bufferization bridge ops are always
+  // inplace, and linalg output tensors with single-use operand + result are
+  // marked inplace.
+  parentOp->walk([](Operation *op) {
+    if (isa<TensorLoadOp, TensorToMemrefOp>(op)) setInplace(op);
+    if (auto linalgOp = dyn_cast<LinalgOp>(op)) {
+      // For now, just check that the operand and corresponding result have
+      // 0 uses. In the future we can build a cost-model to take care of
+      // diamond dependences.
+      unsigned resultIdx = 0;
+      for (auto &opOperand : linalgOp.getOutputTensorsOpOperands()) {
+        if (opOperand->get().hasOneUse() &&
+            linalgOp->getResult(resultIdx).hasOneUse())
+          setInplace(op, opOperand->getOperandNumber());
+        ++resultIdx;
+      }
+    }
+  });
+}
+
+/// Resolve the callee of `callOp` to a FuncOp via symbol lookup; return
+/// nullptr when the callee is not a symbol reference or does not resolve to a
+/// FuncOp.
+static FuncOp getCalledFunction(CallOpInterface callOp) {
+  auto symRef = callOp.getCallableForCallee().dyn_cast<SymbolRefAttr>();
+  if (!symRef) return nullptr;
+  Operation *callee = SymbolTable::lookupNearestSymbolFrom(callOp, symRef);
+  return dyn_cast_or_null<FuncOp>(callee);
+}
+
+/// Run both in-place analyses on `funcOp`: iterative-overwrite detection
+/// first, then destructive-update detection, each on every nested scf.for's
+/// region iter args and on the function arguments themselves.
+static void inplaceAnalysisFuncOpInternals(FuncOp funcOp) {
+  funcOp.walk([&](scf::ForOp forOp) {
+    iterativeOverwritesAnalysis(forOp, forOp.getRegionIterArgs());
+  });
+  iterativeOverwritesAnalysis(funcOp, funcOp.getArguments());
+  funcOp.walk([&](scf::ForOp forOp) {
+    destructiveUpdateAnalysis(forOp, forOp.getRegionIterArgs());
+  });
+  destructiveUpdateAnalysis(funcOp, funcOp.getArguments());
+}
+
+/// Analyse a `callOp` to a FuncOp and determine whether any of its tensor
+/// operand could be safely written inplace after it is converted to buffer
+/// form by a bufferization process. Iterate on the uses of callOp's operands
+/// to determine whether all such uses dominate callOp. If any use of an
+/// operand does not dominate `callOp`, this means that the operand tensor
+/// value may be needed somewhere else and it is illegal to update in-place
+/// after bufferization. Add a `kInPlaceAttrName` string attribute to `callOp`
+/// to carry the result of this analysis until bufferization is completed. The
+/// "meet" of all `kInPlaceAttrName` for all `callOp` to a given FuncOp
+/// determines the `kInPlaceAttrName` for that FuncOp.
+static void inplaceFunctionArgumentAnalysis(CallOpInterface callOp,
+                                            DominanceInfo &domInfo) {
+  FuncOp funcOp = getCalledFunction(callOp);
+  if (!funcOp) return;
+
+  // Fast path: nothing to do if the call carries no tensor operands.
+  if (llvm::none_of(callOp->getOperandTypes(),
+                    [](Type t) { return t.isa<TensorType>(); }))
+    return;
+
+  LLVM_DEBUG(DBGS() << "Begin inplaceFunctionArgumentAnalysis within:\n"
+                    << *callOp->getParentOfType<FuncOp>()
+                    << "callOp: " << *callOp << "\n";);
+  for (OpOperand &opOperand : callOp->getOpOperands()) {
+    Value tensor = opOperand.get();
+    if (!tensor.getType().isa<TensorType>()) continue;
+
+    unsigned idx = opOperand.getOperandNumber();
+    LLVM_DEBUG(DBGS() << "tensor @idx=" << idx << ": " << tensor << "\n");
+
+    // For now, assume any use is a read.
+    // Write-only is a non-problem: will represent with shapes in the future.
+    // If any use of the tensor does not properly dominate callOp, we can't
+    // bufferize the tensor inplace.
+    InPlaceSpec callInPlace = InPlaceSpec::True;
+    for (auto &use : tensor.getUses()) {
+      Operation *user = use.getOwner();
+      if (domInfo.properlyDominates(user, callOp)) continue;
+      // NOTE(review): this skips every non-dominating use at operand position
+      // `idx` without checking that the user is `callOp` itself — presumably
+      // intended to skip the use by `callOp`; verify a different user at the
+      // same operand number cannot occur here.
+      if (use.getOperandNumber() == idx) continue;
+      LLVM_DEBUG(DBGS() << "non-properly dominate user: " << *user << "\n");
+      callInPlace = InPlaceSpec::False;
+      break;
+    }
+    // CallOp instance can immediately determine whether it allows inplace.
+    setInplace(callOp, idx, callInPlace);
+    // FuncOp inplace is the meet of all the calls.
+    InPlaceSpec funcInPlace = getInplace(funcOp, idx);
+    // Once any call site forces False, the function argument stays False.
+    if (funcInPlace == InPlaceSpec::False) continue;
+    setInplace(funcOp, idx, callInPlace);
+  }
+
+  LLVM_DEBUG(DBGS() << "End inplaceFunctionArgumentAnalysis within:\n"
+                    << *callOp->getParentOfType<FuncOp>()
+                    << "callOp: " << *callOp << "\n";);
+}
+
+//===----------------------------------------------------------------------===//
+// Bufferization as simple BlockAndValueMapping rewrites / without
+// conversions.
+//===----------------------------------------------------------------------===//
+
+/// Non-conversion equivalent of the core MLIR Linalg bufferization patterns.
+/// This works on mixed tensor + buffer Linalg ops: some results may have been
+/// already bufferized by a previous destructive update bufferization.
+/// Allocate the output buffers for the remaining tensor output operands of
+/// the Linalg op. If the tensor is an "init" tensor (i.e. its value is
+/// actually used in the payload region), we additionally copy the original
+/// value into the newly allocated buffer.
+static LogicalResult allocateBuffersForResults(
+    OpBuilder &b, Location loc, LinalgOp op,
+    SmallVectorImpl<Value> &resultBuffers, BlockAndValueMapping &bvm) {
+  // Linalg invariant: output tensors and result match 1-1.
+  assert(op.getNumOutputTensors() == op->getNumResults());
+  for (auto &opOperand : op.getOutputOpOperands()) {
+    Value output = opOperand.get();
+    // Already bufferized output: reuse the buffer directly.
+    if (output.getType().isa<MemRefType>()) {
+      resultBuffers.push_back(output);
+      continue;
+    }
+
+    // If output tensor is marked inplace, just use the buffer.
+    if (getInplace(op, opOperand.getOperandNumber()) == InPlaceSpec::True) {
+      resultBuffers.push_back(lookup(bvm, output));
+      continue;
+    }
+
+    // Otherwise allocate a fresh buffer (with matching dealloc) shaped like
+    // the output tensor.
+    Value dimTensor = bvm.lookupOrDefault(output);
+    Value alloc = createNewAllocDeallocPairForShapedValue(b, loc, dimTensor);
+    resultBuffers.push_back(alloc);
+
+    // Additionally, if the output buffer is used, clone its value for now.
+    if (op.payloadUsesValueFromOpOperand(&opOperand))
+      b.create<CopyOp>(loc, lookup(bvm, output), alloc);
+  }
+  map(bvm, op->getResults(), resultBuffers);
+  for (auto it : llvm::zip(op->getResults(), resultBuffers)) {
+    transferDimOpsToMemref(std::get<0>(it), std::get<1>(it));
+  }
+  return success();
+}
+
+// Non-conversion equivalent of the core MLIR Linalg bufferization patterns.
+// Clone `op` onto the new buffer operands, remap the old tensor results onto
+// `outputs`, and erase the original op once it carries no tensor semantics.
+static void finalizeBufferAllocation(OpBuilder &b, LinalgOp op,
+                                     ValueRange inputs, ValueRange outputs,
+                                     BlockAndValueMapping &bvm) {
+  // New operand order: inputs, then outputs, then remaining non-shaped
+  // operands.
+  SmallVector<Value, 8> newOperands = inputs;
+  newOperands.append(outputs.begin(), outputs.end());
+  auto otherOperands = op.getAssumedNonShapedOperands();
+  newOperands.append(otherOperands.begin(), otherOperands.end());
+  Location loc = op.getLoc();
+  // The buffer-form clone produces no results.
+  op.clone(b, loc, /*resultTypes=*/TypeRange{}, newOperands);
+
+  // Replace the results of the old op with the new output buffers.
+  map(bvm, op.getOperation()->getResults(), outputs);
+  for (auto it : llvm::zip(op.getOperation()->getResults(), outputs)) {
+    transferDimOpsToMemref(std::get<0>(it), std::get<1>(it));
+  }
+
+  if (!op.hasTensorSemantics()) op->erase();
+}
+
+/// Generic conversion pattern that matches any LinalgOp. This avoids
+/// template instantiating one pattern for each LinalgOp.
+/// This works on mixed tensor + buffer Linalg ops: some results may have been
+/// already bufferized by a previous destructive update bufferization.
+static LogicalResult convertAnyLinalgOp(OpBuilder &b, LinalgOp op,
+                                        BlockAndValueMapping &bvm) {
+  // Nothing to do when the op is already in buffer form.
+  if (op.hasBufferSemantics()) return failure();
+
+  LLVM_DEBUG(DBGS() << "convertAnyLinalgOp: " << *op << "\n");
+
+  OpBuilder::InsertionGuard g(b);
+  b.setInsertionPoint(op);
+  Location loc = op.getLoc();
+  SmallVector<Value, 2> newInputBuffers;
+  newInputBuffers.reserve(op.getNumInputs());
+  for (Value v : op.getInputs()) {
+    newInputBuffers.push_back(lookup(bvm, v));
+  }
+  SmallVector<Value, 2> newOutputBuffers;
+  // Propagate failure instead of `assert(false)`: the assert compiles out in
+  // NDEBUG builds and execution would continue with missing output buffers.
+  if (failed(allocateBuffersForResults(b, loc, op, newOutputBuffers, bvm)))
+    return failure();
+
+  // Delegate to the linalg generic pattern.
+  if (auto genericOp = dyn_cast<GenericOp>(op.getOperation())) {
+    finalizeBufferAllocation(b, genericOp, newInputBuffers, newOutputBuffers,
+                             bvm);
+    return success();
+  }
+
+  // For non-generic ops, materialize a tensor_load of each new output buffer
+  // so tensor-typed results get a replacement value.
+  SmallVector<Value, 2> newResults;
+  for (OpOperand &outputOpOperand : op.getOutputOpOperands()) {
+    Value output = outputOpOperand.get();
+    if (output.getType().isa<MemRefType>()) continue;
+    auto tensorType = output.getType().cast<RankedTensorType>();
+    OpBuilder::InsertionGuard g(b);
+    b.setInsertionPointAfter(op);
+    Value tensor = b.create<TensorLoadOp>(
+        loc, tensorType,
+        newOutputBuffers[outputOpOperand.getOperandNumber() -
+                         op.getNumInputs()]);
+    newResults.push_back(tensor);
+    map(bvm, tensor,
+        newOutputBuffers[outputOpOperand.getOperandNumber() -
+                         op.getNumInputs()]);
+  }
+  // Can't just map.
+  // map(bvm, op.getOutputs(), newOutputBuffers);
+  // map(bvm, op->getResults(), newResults);
+  // Must explicitly push value out because consumer ops are not guaranteed to
+  // pull the value from bvm (e.g. scf.for with core bufferization use
+  // conversion patterns).
+  op->replaceAllUsesWith(newResults);
+
+  finalizeBufferAllocation(b, op, newInputBuffers, newOutputBuffers, bvm);
+
+  return success();
+}
+
+/// Bufferize a vector.transfer_read / vector.transfer_write that still
+/// operates on a tensor. The read is redirected to the source buffer; the
+/// write is recreated on the buffer with no return value and its tensor
+/// result is replaced with a tensor_load (the old op is left as dead code).
+static LogicalResult convertTransferOp(OpBuilder &b,
+                                       VectorTransferOpInterface op,
+                                       BlockAndValueMapping &bvm) {
+  // Already on a buffer: nothing to do.
+  if (op.getShapedType().isa<MemRefType>()) return failure();
+
+  assert(op->getNumResults() == 1);
+  Value outputTensor = op->getResult(0);
+  OpBuilder::InsertionGuard g(b);
+  b.setInsertionPoint(op);
+  Location loc = op.getLoc();
+  Value newInputBuffer = lookup(bvm, op.source());
+  // Ranked tensor result: create an alloc/dealloc pair and map the result to
+  // it.
+  if (auto tensorType =
+          op->getResult(0).getType().dyn_cast<RankedTensorType>()) {
+    Value tensor = bvm.lookupOrDefault(outputTensor);
+    Value alloc = createNewAllocDeallocPairForShapedValue(b, loc, tensor);
+    map(bvm, op->getResult(0), alloc);
+    transferDimOpsToMemref(op->getResult(0), alloc);
+  }
+
+  // Replace the tensor operand.
+  if (auto readOp = dyn_cast<vector::TransferReadOp>(op.getOperation())) {
+    readOp.sourceMutable().assign(newInputBuffer);
+  } else {
+    auto writeOp = cast<vector::TransferWriteOp>(op.getOperation());
+    // Create a new transfer_write on buffer that doesn't have a return value.
+    // Leave the previous transfer_write to dead code as it still has uses at
+    // this point.
+    b.create<vector::TransferWriteOp>(
+        loc, writeOp.vector(), newInputBuffer, writeOp.indices(),
+        writeOp.permutation_map(),
+        writeOp.masked() ? *writeOp.masked() : ArrayAttr());
+
+    // Materialize the written-to buffer as the replacement tensor value.
+    Value tensor = b.create<TensorLoadOp>(
+        loc, writeOp.getResult(0).getType().cast<RankedTensorType>(),
+        newInputBuffer);
+    SmallVector<Value, 1> newResult(1, {tensor});
+    writeOp.replaceAllUsesWith(newResult);
+    map(bvm, tensor, newInputBuffer);
+  }
+  return success();
+}
+
+/// Bufferize linalg.init_tensor: create an alloc/dealloc pair for the result
+/// shape (using the op's dynamic sizes) and map the init_tensor result to the
+/// new allocation.
+static LogicalResult convertInitTensorOp(OpBuilder &b,
+                                         InitTensorOp initTensorOp,
+                                         BlockAndValueMapping &bvm) {
+  OpBuilder::InsertionGuard g(b);
+  b.setInsertionPoint(initTensorOp);
+  Value alloc = createNewAllocDeallocPairForShapedValue(
+      b, initTensorOp->getLoc(), initTensorOp.result(), initTensorOp.sizes());
+  map(bvm, initTensorOp.result(), alloc);
+  return success();
+}
+
+/// Bufferize linalg.pad_tensor by casting the source buffer to the result's
+/// contiguous memref type and mapping the result to the cast.
+/// NOTE(review): no padding values are materialized and no larger buffer is
+/// allocated — the result aliases the unpadded source buffer. Confirm this is
+/// only reached under whatever invariant makes the cast legal.
+static LogicalResult convertPadTensorOp(OpBuilder &b, PadTensorOp padTensorOp,
+                                        BlockAndValueMapping &bvm) {
+  OpBuilder::InsertionGuard g(b);
+  b.setInsertionPoint(padTensorOp);
+  auto tensorType = padTensorOp.result().getType().cast<RankedTensorType>();
+  auto sourceMemRef = lookup(bvm, padTensorOp.source());
+  auto sourceMemRefType = sourceMemRef.getType().cast<MemRefType>();
+  // Result memref keeps the source's layout maps and memory space.
+  auto memRefType =
+      getContiguousMemRefType(tensorType, sourceMemRefType.getAffineMaps(),
+                              sourceMemRefType.getMemorySpaceAsInt());
+  Value res =
+      b.create<MemRefCastOp>(padTensorOp.getLoc(), memRefType, sourceMemRef);
+  map(bvm, padTensorOp.result(), res);
+  return success();
+}
+
+/// Bufferize subtensor_insert: take a subview of the destination buffer at
+/// the insert coordinates, copy the source into it when the producer did not
+/// already write there in place, and replace the result with a tensor_load of
+/// the destination buffer.
+static LogicalResult convertSubTensorInsertOp(
+    OpBuilder &b, SubTensorInsertOp subTensorInsertOp,
+    BlockAndValueMapping &bvm) {
+  Location loc = subTensorInsertOp.getLoc();
+  OpBuilder::InsertionGuard g(b);
+  b.setInsertionPoint(subTensorInsertOp);
+  Value dstMemref = lookup(bvm, subTensorInsertOp.dest());
+  auto dstMemrefType = dstMemref.getType().cast<MemRefType>();
+  Value srcMemref = lookup(bvm, subTensorInsertOp.source());
+  // Rank-reduced subview type matching the inserted region.
+  auto subviewMemRefType =
+      SubViewOp::inferRankReducedResultType(
+          subTensorInsertOp.getSourceType().getRank(), dstMemrefType,
+          subTensorInsertOp.getMixedOffsets(),
+          subTensorInsertOp.getMixedSizes(),
+          subTensorInsertOp.getMixedStrides())
+          .cast<MemRefType>();
+  // Take a subview of the dst.
+  Value subView = b.create<SubViewOp>(
+      loc, subviewMemRefType, dstMemref, subTensorInsertOp.getMixedOffsets(),
+      subTensorInsertOp.getMixedSizes(), subTensorInsertOp.getMixedStrides());
+  // Linalg op and vector.transfer_write producers directly write their output
+  // buffer. If the producer is not one of these ops, or if the
+  // subtensor_insert is not marked inplace, we need to copy.
+  bool isInPlaceProducer =
+      subTensorInsertOp.source().getDefiningOp<LinalgOp>() ||
+      subTensorInsertOp.source().getDefiningOp<vector::TransferWriteOp>();
+  if (!isInPlaceProducer || getInplace(subTensorInsertOp) != InPlaceSpec::True)
+    b.create<CopyOp>(subTensorInsertOp.getLoc(), srcMemref, subView);
+  // Replace the tensor result with a load of the whole destination buffer.
+  Value tensor = b.create<TensorLoadOp>(
+      loc, subTensorInsertOp->getResult(0).getType(), dstMemref);
+  SmallVector<Value, 1> newResult(1, {tensor});
+  subTensorInsertOp->replaceAllUsesWith(newResult);
+  map(bvm, tensor, dstMemref);
+  return success();
+}
+
+/// Bufferize subtensor as a subview of the source buffer; no data is copied,
+/// the result simply aliases the selected region.
+static LogicalResult convertSubTensorOp(OpBuilder &b, SubTensorOp subTensor,
+                                        BlockAndValueMapping &bvm) {
+  Location loc = subTensor.getLoc();
+  OpBuilder::InsertionGuard g(b);
+  b.setInsertionPoint(subTensor);
+  Value srcMemref = lookup(bvm, subTensor.source());
+  auto srcMemrefType = srcMemref.getType().cast<MemRefType>();
+  auto dstTensorType = subTensor.result().getType().cast<RankedTensorType>();
+
+  // Rank-reduced subview type matching the extracted region.
+  auto subviewMemRefType =
+      SubViewOp::inferRankReducedResultType(
+          dstTensorType.getRank(), srcMemrefType, subTensor.getMixedOffsets(),
+          subTensor.getMixedSizes(), subTensor.getMixedStrides())
+          .cast<MemRefType>();
+
+  Value subView = b.create<SubViewOp>(
+      loc, subviewMemRefType, srcMemref, subTensor.getMixedOffsets(),
+      subTensor.getMixedSizes(), subTensor.getMixedStrides());
+  map(bvm, subTensor.result(), subView);
+  return success();
+}
+
+/// Bufferize tensor.cast as a memref_cast of the source buffer, choosing an
+/// unranked or contiguous ranked memref type to mirror the result tensor
+/// type.
+static LogicalResult convertTensorCastOp(OpBuilder &b, tensor::CastOp castOp,
+                                         BlockAndValueMapping &bvm) {
+  OpBuilder::InsertionGuard g(b);
+  b.setInsertionPoint(castOp);
+  // NOTE(review): dyn_cast yields a null MemRefType when the source buffer is
+  // unranked; the getMemorySpaceAsInt()/getAffineMaps() calls below would
+  // then operate on a null type — confirm sources are always ranked here.
+  auto sourceMemRefType =
+      lookup(bvm, castOp.source()).getType().dyn_cast<MemRefType>();
+  Type memRefType;
+  TensorType tensorType = castOp.getResult().getType().cast<TensorType>();
+  if (tensorType.isa<UnrankedTensorType>()) {
+    memRefType = UnrankedMemRefType::get(
+        tensorType.getElementType(), sourceMemRefType.getMemorySpaceAsInt());
+  } else {
+    memRefType =
+        getContiguousMemRefType(tensorType, sourceMemRefType.getAffineMaps(),
+                                sourceMemRefType.getMemorySpaceAsInt());
+  }
+  Value res = b.create<MemRefCastOp>(castOp.getLoc(), memRefType,
+                                     lookup(bvm, castOp.source()));
+  map(bvm, castOp.getResult(), res);
+  return success();
+}
+
+static void bufferizeFunctionCallBoundaries(FuncOp funcOp) {
+  // kResultFoldArgAttrName is set once funcOp is bufferized.
+  if (funcOp->getAttr(kResultFoldArgAttrName)) return;
+
+  SmallVector<int64_t> resultArgumentFolding(
+      funcOp.type().cast<FunctionType>().getNumResults(), -1);
+
+  LLVM_DEBUG(DBGS() << "Begin bufferizeFunctionCallBoundaries:\n" << funcOp);
+
+  // Take the terminator (assume the last block is the only one that has it).
+  auto returnOp = cast<ReturnOp>(funcOp.body().back().getTerminator());
+  for (OpOperand &returnOpOperand : returnOp->getOpOperands()) {
+    Value returnValue = returnOpOperand.get();
+    unsigned returnIndex = returnOpOperand.getOperandNumber();
+    if (!returnValue.getType().isa<RankedTensorType>()) continue;
+
+    // If returned value is a bbArg, it only folds if it is a function
+    // argument.
+    BlockArgument bbArg = returnValue.dyn_cast<BlockArgument>();
+    if (bbArg) {
+      if (returnValue == funcOp.getArgument(bbArg.getArgNumber()))
+        resultArgumentFolding[returnIndex] = bbArg.getArgNumber();
+      else
+        continue;
+    }
+
+    // Otherwise we look for tensor_load(tensor_to_memref(bbarg)).
+    auto tensorLoadOp = returnValue.getDefiningOp<TensorLoadOp>();
+    if (!tensorLoadOp) continue;
+    auto tensorToMemRefOp =
+        tensorLoadOp.memref().getDefiningOp<TensorToMemrefOp>();
+    if (!tensorToMemRefOp) continue;
+
+    // If returned value is a bbArg, it only folds if it is a function
+    // argument.
+    bbArg = tensorToMemRefOp.tensor().dyn_cast<BlockArgument>();
+    if (bbArg) {
+      if (bbArg == funcOp.getArgument(bbArg.getArgNumber()))
+        resultArgumentFolding[returnIndex] = bbArg.getArgNumber();
+      else
+        continue;
+    }
+  }
+
+  funcOp->setAttr(kResultFoldArgAttrName,
+                  OpBuilder(funcOp).getI64ArrayAttr(resultArgumentFolding));
+
+  OpBuilder b(returnOp);
+  SmallVector<Value> returnValues;
+  for (auto en : enumerate(resultArgumentFolding)) {
+    LLVM_DEBUG(DBGS() << "return idx: " << en.index() << " folds on "
+                      << en.value() << "\n");
+    // Return value folds on some input.
+    if (en.value() >= 0) continue;
+
+    // Return value does not fold, add it to the new return op.
+    Value unfolded = returnOp->getOperand(en.index());
+    if (auto tensorLoadOp = unfolded.getDefiningOp<TensorLoadOp>()) {
+      unfolded = tensorLoadOp.memref();
+      for (Operation *user : llvm::make_early_inc_range(unfolded.getUsers()))
+        if (isa<DeallocOp>(user)) user->erase();
+    }
+    returnValues.push_back(unfolded);
+    llvm::errs() << "return val does not fold: " << returnValues.back() << "\n";
+  }
+  b.create<ReturnOp>(returnOp.getLoc(), returnValues);
+  returnOp->erase();
+
+  auto argTypes = llvm::to_vector<4>(
+      llvm::map_range(funcOp.getArguments(), [](BlockArgument bbArg) -> Type {
+        // TODO: non-zero address space.
+        // TODO: layout information if relevant.
+        if (auto tensorType = bbArg.getType().dyn_cast<RankedTensorType>())
+          return getContiguousMemRefType(tensorType);
+        return bbArg.getType();
+      }));
+  funcOp.setType(FunctionType::get(funcOp->getContext(), argTypes,
+                                   ValueRange{returnValues}.getTypes()));
+  Block &frontBlock = funcOp.body().front();
+  for (unsigned idx = 0, e = frontBlock.getNumArguments(); idx < e; ++idx) {
+    auto bbArg = frontBlock.getArgument(0);
+    auto tensorType = bbArg.getType().dyn_cast<RankedTensorType>();
+    if (!tensorType) {
+      frontBlock.addArgument(bbArg.getType());
+      bbArg.replaceAllUsesWith(frontBlock.getArguments().back());
+    } else {
+      // TODO: non-zero address space.
+      // TODO: layout information if relevant.
+      Value memref =
+          frontBlock.addArgument(getContiguousMemRefType(tensorType));
+      OpBuilder b(funcOp->getContext());
+      // No InsertionGuard needed here.
+      b.setInsertionPointToStart(&frontBlock);
+      Value tensor = b.create<TensorLoadOp>(funcOp->getLoc(), memref);
+      bbArg.replaceAllUsesWith(tensor);
+    }
+    frontBlock.eraseArgument(0);
+  }
+
+  LLVM_DEBUG(DBGS() << "End bufferizeFunctionCallBoundaries:\n" << funcOp);
+}
+
+/// Bufferize a single function call.
+/// Look for the following pattern for each result to determine whether it can
+/// fold onto an argument:
+/// ```
+///    func @foo(%A: tensor<...>, ..., %Z: tensor<...>) ->
+///      (tensor<...>, ..., tensor<...>)
+///      #inplace_attr_specification
+///    {
+///       %p = tensor_to_memref(%some_arg): ...
+///       ... // uses of %p (read or writes)
+///       %t = tensor_load %p: ...
+///       return ..., %t, ...: ..., tensor<...>, ...
+///    }
+/// ```
+static void bufferizeFunctionCall(CallOpInterface callOp,
+                                  DominanceInfo &domInfo) {
+  FuncOp funcOp = getCalledFunction(callOp);
+  if (!funcOp) return;
+  if (funcOp.body().empty()) return;
+
+  // Only bufferizes the first time `funcOp` is encountered.
+  bufferizeFunctionCallBoundaries(funcOp);
+
+  SmallVector<Value> newOperands;
+  for (Value v : callOp->getOperands()) {
+    if (!v.getType().isa<RankedTensorType>()) {
+      newOperands.push_back(v);
+      continue;
+    }
+    if (auto tensorLoadOp = v.getDefiningOp<TensorLoadOp>()) {
+      newOperands.push_back(tensorLoadOp.memref());
+      continue;
+    }
+    llvm::errs() << "operand: " << v << "\n";
+    llvm_unreachable("Operand does not come from a tensor_load");
+  }
+
+  assert(isa<CallOp>(callOp.getOperation()) && "expected a CallOp");
+  OpBuilder b(callOp);
+  Operation *newCallOp = b.create<CallOp>(
+      callOp.getLoc(), funcOp.sym_name(),
+      funcOp.type().cast<FunctionType>().getResults(), newOperands);
+  newCallOp->setAttrs(callOp.getAttrs());
+
+  int numFoldedArgsSoFar = 0;
+  for (unsigned callRetIdx = 0, e = callOp->getNumResults(); callRetIdx < e;
+       ++callRetIdx) {
+    unsigned newCallReturnIdx = callRetIdx - numFoldedArgsSoFar;
+    auto maybeFoldedArgIndex = getResultFoldArgIndex(funcOp, callRetIdx);
+    if (maybeFoldedArgIndex) ++numFoldedArgsSoFar;
+
+    // If not a ranked tensor, no changes, just replace the new result.
+    if (!callOp->getResult(callRetIdx).getType().isa<RankedTensorType>()) {
+      assert(!maybeFoldedArgIndex);
+      callOp->getResult(callRetIdx)
+          .replaceAllUsesWith(newCallOp->getResult(newCallReturnIdx));
+      continue;
+    }
+
+    // If the old callOp result is a ranked tensor that does not fold on some
+    // input, then there must be an allocated return value.
+    // That value should be deallocated by the caller.
+    // That value should be lifted out of the callee at the first enclosing
+    // parallel scope. This lifting should be done to (the meet of) all
+    // callers before we can hoist the alloc out of the funcOp.
+    Value resultMemref = (maybeFoldedArgIndex)
+                             ? newOperands[*maybeFoldedArgIndex]
+                             : newCallOp->getResult(newCallReturnIdx);
+    callOp->getResult(callRetIdx)
+        .replaceAllUsesWith(
+            b.create<TensorLoadOp>(callOp.getLoc(), resultMemref));
+    OpBuilder::InsertionGuard g(b);
+    b.setInsertionPoint(callOp->getBlock()->getTerminator());
+    // If function returns a memref, it must be freed.
+    if (!maybeFoldedArgIndex)
+      b.create<DeallocOp>(callOp.getLoc(), resultMemref);
+  }
+
+  callOp->erase();
+}
+
+//===----------------------------------------------------------------------===//
+// Bufferization passes.
+//===----------------------------------------------------------------------===//
+
+// Transformations that run iteratively with bufferization.
+void LinalgComprehensiveBufferizePass::runEnablingTransforms(FuncOp funcOp) {
+  if (failed(runPipeline(enablingPassPipeline, funcOp)))
+    return signalPassFailure();
+  (void)runPipeline(enablingPassPipeline, funcOp);
+  linalg::hoistRedundantVectorTransfers(funcOp);
+  linalg::hoistRedundantVectorTransfersOnTensor(funcOp);
+}
+
+void LinalgComprehensiveBufferizePass::bufferizeFuncOpInternals(FuncOp funcOp) {
+  LLVM_DEBUG(DBGS() << "Start BufferizeFuncOpInternals:\n" << funcOp);
+
+  OpBuilder b(funcOp->getContext());
+  BlockAndValueMapping bvm;
+  bool changed = true;
+  // It is likely overkill to do this in a loop with canonicalization and
+  // hoisting but until we stabilize bufferization, c'est la vie.
+  while (changed) {
+    changed = false;
+    runEnablingTransforms(funcOp);
+
+    // CSE changes the result of the analysis, need to compute/mark/invalidate
+    // at each iteration.
+    inplaceAnalysisFuncOpInternals(funcOp);
+    auto guard = llvm::make_scope_exit([&] {
+      funcOp.walk([&](Operation *op) { op->removeAttr(kInPlaceAttrName); });
+    });
+
+    funcOp.walk([&](Operation *operation) {
+      llvm::TypeSwitch<Operation *, void>(operation)
+          // TensorLoadOp is not allowed to just fold into the memref!
+          // If it may alias, it must clone.
+          .Case([&](TensorLoadOp op) {
+            // TODO: reduce amount of surprise.
+            if (auto tensorToMemRef =
+                    op.memref().getDefiningOp<TensorToMemrefOp>()) {
+              // Folding is allowed when tensor_to_memref immediately
+              // precedes tensor_load -> no interleaved aliasing.
+              if (tensorToMemRef->getNextNode() == op) {
+                map(bvm, op.result(), op.memref());
+                changed = true;
+              }
+              // TODO: else clone.
+            }
+          })
+          .Case([&](TensorToMemrefOp op) {
+            // TODO: reduce amount of surprise.
+            Value repl = bvm.lookupOrDefault(op.tensor());
+            if (op.memref() != repl) {
+              op.memref().replaceAllUsesWith(repl);
+              op->erase();
+            }
+          })
+          .Case([&](InitTensorOp op) {
+            changed = succeeded(convertInitTensorOp(b, op, bvm));
+          })
+          .Case([&](SubTensorOp op) {
+            changed = succeeded(convertSubTensorOp(b, op, bvm));
+          })
+          .Case([&](SubTensorInsertOp op) {
+            changed = succeeded(convertSubTensorInsertOp(b, op, bvm));
+          })
+          .Case([&](tensor::CastOp op) {
+            changed = succeeded(convertTensorCastOp(b, op, bvm));
+          })
+          .Case([&](PadTensorOp op) {
+            changed = succeeded(convertPadTensorOp(b, op, bvm));
+          })
+          .Case([&](LinalgOp op) {
+            changed = succeeded(convertAnyLinalgOp(b, op, bvm));
+          })
+          .Case([&](VectorTransferOpInterface op) {
+            changed = succeeded(convertTransferOp(b, op, bvm));
+          });
+    });
+
+    LLVM_DEBUG(DBGS() << "BufferizeFuncOpInternals step:\n" << funcOp);
+  }
+}
+
+namespace mlir {
+std::unique_ptr<Pass> createLinalgComprehensiveBufferizePass() {
+  return std::make_unique<LinalgComprehensiveBufferizePass>();
+}
+namespace linalg {
+void registerLinalgComprehensiveBufferizePass() {
+  PassRegistration<LinalgComprehensiveBufferizePass> pass(
+      "linalg-comprehensive-bufferize-inplace",
+      "Perform all required bufferization incantations to convert code with "
+      "Linalg ops on tensors to buffers with inplace optimizations.");
+}
+}  // namespace linalg
+}  // namespace mlir
+
+void LinalgComprehensiveBufferizePass::runOnOperation() {
+  ModuleOp module = getOperation();
+  DominanceInfo domInfo(module);
+  module.walk([&](CallOpInterface callOp) {
+    inplaceFunctionArgumentAnalysis(callOp, domInfo);
+  });
+
+  module.walk([&](FuncOp funcOp) { bufferizeFuncOpInternals(funcOp); });
+
+  // Recompute domInfo.
+  domInfo = DominanceInfo(module);
+  module.walk(
+      [&](CallOpInterface callOp) { bufferizeFunctionCall(callOp, domInfo); });
+  PassManager pm(module.getContext());
+  pm.addPass(createCanonicalizerPass());
+  (void)pm.run(module);
+
+  // Cleanups and sanity checks.
+  module.walk([&](Operation *op) {
+    op->removeAttr(kInPlaceAttrName);
+    op->removeAttr(kResultFoldArgAttrName);
+    if (auto tensorLoadOp = dyn_cast<TensorLoadOp>(op)) {
+      if (tensorLoadOp.memref().getDefiningOp<TensorToMemrefOp>()) {
+        op->getParentOfType<ModuleOp>()->dump();
+        op->emitWarning(
+            "Most likely incorrect pattern: tensor_load(tensor_to_memref)");
+        abort();
+      }
+    }
+    if (auto callOp = dyn_cast<CallOpInterface>(op)) {
+      for (auto result : callOp->getResults()) {
+        if (result.getType().isa<MemRefType>()) {
+          op->getParentOfType<ModuleOp>()->dump();
+          op->emitWarning(
+              "Most likely incorrect pattern: function returning memref -> "
+              "alloc needs to be hoisted out of function boundary");
+          abort();
+        }
+      }
+    }
+  });
+}
diff --git a/experimental/runners/LinalgTensorCodegenStrategy.cpp b/experimental/runners/LinalgTensorCodegenStrategy.cpp
new file mode 100644
index 0000000..09b21b7
--- /dev/null
+++ b/experimental/runners/LinalgTensorCodegenStrategy.cpp
@@ -0,0 +1,304 @@
+//===- LinalgTensorCodegenStrategyPass.cpp - Test Linalg codegen strategy--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements logic for testing the Linalg codegen strategy.
+//
+//===----------------------------------------------------------------------===//
+
+// TODO: avoid copy-pasta but I can't seem to be able to inherit from a pass.
+// Will get better once upstreamed to core and it replaces the existing codegen
+// strategy.
+
+#include "Transforms.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
+#include "mlir/Dialect/Linalg/Transforms/CodegenStrategy.h"
+#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/Linalg/Utils/Utils.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+using namespace mlir::linalg;
+
+namespace {
+struct LinalgTensorCodegenStrategyPass
+    : public PassWrapper<LinalgTensorCodegenStrategyPass, FunctionPass> {
+  LinalgTensorCodegenStrategyPass() = default;
+  LinalgTensorCodegenStrategyPass(const LinalgTensorCodegenStrategyPass &pass) {
+  }
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    // clang-format off
+    registry.insert<AffineDialect,
+                    gpu::GPUDialect,
+                    linalg::LinalgDialect,
+                    scf::SCFDialect,
+                    StandardOpsDialect,
+                    vector::VectorDialect>();
+    // clang-format on
+  }
+
+  template <typename LinalgNamedOp>
+  void applyStrategyToNamedLinalgOp();
+
+  void runOnFunction() override;
+
+  ListOption<int64_t> tileSizes{*this, "tile-sizes",
+                                llvm::cl::MiscFlags::CommaSeparated,
+                                llvm::cl::desc("Specifies the tile sizes.")};
+  Option<bool> promote{
+      *this, "promote",
+      llvm::cl::desc("Promote the tile into a small aligned memory buffer."),
+      llvm::cl::init(false)};
+  Option<bool> promoteFullTile{
+      *this, "promote-full-tile-pad",
+      llvm::cl::desc("Pad the small aligned memory buffer to the tile sizes."),
+      llvm::cl::init(false)};
+  Option<bool> vectorize{
+      *this, "vectorize",
+      llvm::cl::desc("Rewrite the linalg op as a vector operation."),
+      llvm::cl::init(false)};
+  Option<std::string> splitVectorTransfersTo{
+      *this, "split-transfers",
+      llvm::cl::desc(
+          "Split vector transfers between slow (masked) and fast "
+          "(unmasked) variants. Possible options are:\n"
+          "\tnone: keep unsplit vector.transfer and pay the full price\n"
+          "\tlinalg-copy: use linalg.fill + linalg.copy for the slow path\n"
+          "\tvector-transfers: use extra small unmasked vector.transfer for"
+          " the slow path\n"),
+      llvm::cl::init("none")};
+  Option<std::string> vectorizeContractionTo{
+      *this, "vectorize-contraction-to",
+      llvm::cl::desc("the type of vector op to use for linalg contractions"),
+      llvm::cl::init("outerproduct")};
+  Option<bool> unrollVectorTransfers{
+      *this, "unroll-vector-transfers",
+      llvm::cl::desc("Enable full unrolling of vector.transfer operations"),
+      llvm::cl::init(false)};
+  Option<bool> licm{*this, "licm", llvm::cl::desc("Enable LICM."),
+                    llvm::cl::init(true)};
+  Option<bool> hoistRedundantVectorTransfers{
+      *this, "hoist-redundant-vector-transfers",
+      llvm::cl::desc("Enable HoistRedundantVectorTransfers"),
+      llvm::cl::init(true)};
+  Option<bool> vectorTransferPartialRewrite{
+      *this, "vector-transfer-partial-rewrite",
+      llvm::cl::desc("Enable rewriting of vector.transfer operations into "
+                     "full/partial read/writes."),
+      llvm::cl::init(true)};
+  Option<bool> vectorContractLowering{
+      *this, "vector-contract-lowering",
+      llvm::cl::desc("Enable lowering of vector contractions."),
+      llvm::cl::init(true)};
+  Option<bool> vectorToSCFConversion{
+      *this, "vector-to-scf-conversion",
+      llvm::cl::desc("Enable vector to scf conversions."),
+      llvm::cl::init(true)};
+  Option<std::string> anchorOpName{
+      *this, "anchor-op",
+      llvm::cl::desc(
+          "Which single linalg op is the anchor for the codegen strategy to "
+          "latch on:\n"
+          "\tlinalg.matmul: anchor on linalg.matmul\n"
+          "\tlinalg.matmul_column_major: anchor on linalg.matmul_column_major\n"
+          "\tlinalg.copy: anchor on linalg.copy\n"
+          "\tlinalg.fill: anchor on linalg.fill\n"),
+      llvm::cl::init("")};
+  Option<std::string> anchorFuncOpName{
+      *this, "anchor-func",
+      llvm::cl::desc(
+          "Which single func op is the anchor for the codegen strategy to "
+          "latch on."),
+      llvm::cl::init("")};
+
+  Option<bool> distribute{
+      *this, "distribute",
+      llvm::cl::desc("Distribute the linalg op into a TiledGeneric."),
+      llvm::cl::init(false)};
+  ListOption<int64_t> distributeTileSizes{
+      *this, "distribute-tile-sizes", llvm::cl::MiscFlags::CommaSeparated,
+      llvm::cl::desc("Specifies the tile sizes.")};
+  Option<bool> pad{*this, "pad", llvm::cl::desc("Use padding during tiling."),
+                   llvm::cl::init(false)};
+  Option<int> hoistPadding{
+      *this, "hoist-padding",
+      llvm::cl::desc("Hoist padding by the number of specified loops."),
+      llvm::cl::init(0)};
+  Option<bool> vectorizePadding{
+      *this, "vectorize-padding",
+      llvm::cl::desc("Rewrite linalg.pad_tensor in vector form."),
+      llvm::cl::init(false)};
+};
+}  // end anonymous namespace
+
+static void runStrategy(LinalgTensorCodegenStrategyPass &pass,
+                        CodegenStrategy &strategy) {
+  strategy.setEnableLICM(pass.licm)
+      .setEnableHoistRedundantVectorTransfers(
+          pass.hoistRedundantVectorTransfers)
+      .setEnableHoistRedundantVectorTransfersOnTensor(
+          pass.hoistRedundantVectorTransfers)
+      .setEnableVectorTransferPartialRewrite(pass.vectorTransferPartialRewrite)
+      .setEnableVectorContractLowering(pass.vectorContractLowering)
+      .setEnableVectorToSCFConversion(pass.vectorToSCFConversion)
+      .transform(pass.getFunction());
+}
+
+static void runGenericStrategy(
+    LinalgTensorCodegenStrategyPass &pass, LinalgTilingOptions tilingOptions,
+    vector::VectorContractLowering vectorContractLowering,
+    vector::VectorTransferSplit vectorTransferSplit) {
+  assert(!pass.anchorOpName.empty());
+  CodegenStrategy strategy;
+  strategy
+      .tileIf<LinalgOp>(!pass.tileSizes.empty(), pass.anchorOpName,
+                        tilingOptions)
+      .promoteIf<LinalgOp>(
+          pass.promote, pass.anchorOpName,
+          LinalgPromotionOptions()
+              .setAlignment(16)
+              .setUseFullTileBuffersByDefault(pass.promoteFullTile))
+      .vectorizeIf(pass.vectorize, pass.anchorOpName)
+      .setVectorTransformsOptions(
+          vector::VectorTransformsOptions()
+              .setVectorTransformsOptions(vectorContractLowering)
+              .setVectorTransferSplit(vectorTransferSplit))
+      .setVectorTransferToSCFOptions(
+          VectorTransferToSCFOptions().setUnroll(pass.unrollVectorTransfers));
+  runStrategy(pass, strategy);
+}
+
+template <typename OpType>
+static void runStrategy(LinalgTensorCodegenStrategyPass &pass,
+                        LinalgTilingOptions tilingOptions,
+                        vector::VectorContractLowering vectorContractLowering,
+                        vector::VectorTransferSplit vectorTransferSplit) {
+  CodegenStrategy strategy;
+  strategy.tileIf<OpType>(!pass.tileSizes.empty(), tilingOptions)
+      .template promoteIf<OpType>(
+          pass.promote,
+          LinalgPromotionOptions()
+              .setAlignment(16)
+              .setUseFullTileBuffersByDefault(pass.promoteFullTile))
+      .template vectorizeIf<OpType>(pass.vectorize)
+      .setVectorTransformsOptions(
+          vector::VectorTransformsOptions()
+              .setVectorTransformsOptions(vectorContractLowering)
+              .setVectorTransferSplit(vectorTransferSplit))
+      .setVectorTransferToSCFOptions(
+          VectorTransferToSCFOptions().setUnroll(pass.unrollVectorTransfers));
+  runStrategy(pass, strategy);
+}
+
+// For now, just assume it is the zero of type.
+// In the future, it should be the zero of type + op.
+static Value getNeutralOfLinalgOp(OpBuilder &b, OpOperand &op) {
+  auto t = getElementTypeOrSelf(op.get().getType());
+  return b.create<ConstantOp>(op.getOwner()->getLoc(), t, b.getZeroAttr(t));
+}
+
+/// Apply transformations specified as patterns.
+void LinalgTensorCodegenStrategyPass::runOnFunction() {
+  if (!anchorFuncOpName.empty() && anchorFuncOpName != getFunction().getName())
+    return;
+
+  if (distribute && !distributeTileSizes.empty()) {
+    LinalgTilingOptions tilingOptions;
+    tilingOptions = tilingOptions.setTileSizes(distributeTileSizes);
+    if (pad)
+      tilingOptions = tilingOptions.setPaddingValueComputationFunction(
+          getNeutralOfLinalgOp);
+    OwningRewritePatternList patterns;
+    patterns.insert<TileAndDistributePattern>(
+        TileAndDistributeOptions{tilingOptions},
+        LinalgTransformationFilter(
+            ArrayRef<Identifier>{},
+            {Identifier::get("distributed", getFunction().getContext())})
+            .addFilter([](Operation *op) {
+              return success(isaContractionOpInterface(dyn_cast<LinalgOp>(op)));
+            }));
+    (void)applyPatternsAndFoldGreedily(getFunction(), std::move(patterns));
+    // Ensure we drop the marker in the end.
+    getFunction().walk([](LinalgOp op) {
+      op.removeAttr(LinalgTransforms::kLinalgTransformMarker);
+    });
+  }
+
+  LinalgTilingOptions tilingOptions;
+  if (!tileSizes.empty()) tilingOptions = tilingOptions.setTileSizes(tileSizes);
+  if (pad)
+    tilingOptions =
+        tilingOptions.setPaddingValueComputationFunction(getNeutralOfLinalgOp);
+
+  vector::VectorContractLowering vectorContractLowering =
+      llvm::StringSwitch<vector::VectorContractLowering>(
+          vectorizeContractionTo.getValue())
+          .Case("matrixintrinsics", vector::VectorContractLowering::Matmul)
+          .Case("dot", vector::VectorContractLowering::Dot)
+          .Case("outerproduct", vector::VectorContractLowering::OuterProduct)
+          .Default(vector::VectorContractLowering::OuterProduct);
+  vector::VectorTransferSplit vectorTransferSplit =
+      llvm::StringSwitch<vector::VectorTransferSplit>(
+          splitVectorTransfersTo.getValue())
+          .Case("none", vector::VectorTransferSplit::None)
+          .Case("linalg-copy", vector::VectorTransferSplit::LinalgCopy)
+          .Case("vector-transfers", vector::VectorTransferSplit::VectorTransfer)
+          .Default(vector::VectorTransferSplit::None);
+
+  if (!anchorOpName.empty()) {
+    using RunFnType = decltype(&runGenericStrategy);
+    RunFnType runFn = &runGenericStrategy;
+    // `linalg::PadTensorOp::getOperationName()` is not a StringLiteral, cannot
+    // use StringSwitch.
+    if (anchorOpName == CopyOp::getOperationName())
+      runFn = &runStrategy<CopyOp>;
+    if (anchorOpName == FillOp::getOperationName())
+      runFn = &runStrategy<FillOp>;
+    if (anchorOpName == PadTensorOp::getOperationName())
+      runFn = &runStrategy<PadTensorOp>;
+    else if (anchorOpName == MatmulOp::getOperationName())
+      runFn = &runStrategy<MatmulOp>;
+    else if (anchorOpName == MatmulI8I8I32Op::getOperationName())
+      runFn = &runStrategy<MatmulI8I8I32Op>;
+    runFn(*this, tilingOptions, vectorContractLowering, vectorTransferSplit);
+  }
+
+  // Transforms that do not require anchoring on a given op.
+  if (hoistPadding > 0) {
+    SmallVector<PadTensorOp> ops;
+    getFunction().walk([&](PadTensorOp op) { ops.push_back(op); });
+    for (auto op : llvm::reverse(ops))
+      (void)hoistPaddingOnTensors(op, hoistPadding);
+  }
+  if (vectorizePadding) {
+    OwningRewritePatternList extraVectorizationPatterns;
+    extraVectorizationPatterns.insert<PadTensorOpVectorizationPattern>(
+        &getContext());
+    (void)applyPatternsAndFoldGreedily(getFunction(),
+                                       std::move(extraVectorizationPatterns));
+  }
+}
+
+namespace mlir {
+namespace linalg {
+void registerLinalgTensorCodegenStrategyPass() {
+  PassRegistration<LinalgTensorCodegenStrategyPass>
+      testLinalgCodegenStrategyPass("linalg-tensor-codegen-strategy",
+                                    "Linalg Tensor Codegen Strategy.");
+}
+}  // namespace linalg
+}  // namespace mlir
diff --git a/experimental/runners/LinalgTileToGeneric.cpp b/experimental/runners/LinalgTileToGeneric.cpp
new file mode 100644
index 0000000..4d1a5ce
--- /dev/null
+++ b/experimental/runners/LinalgTileToGeneric.cpp
@@ -0,0 +1,435 @@
+//===- LinalgTileToGenericPass.cpp - Tile and distribute to linalg.tile ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements logic for tile and distribute to a tiled nested linalg
+// abstraction.
+//
+//===----------------------------------------------------------------------===//
+
+#include <iterator>
+#include <memory>
+
+#include "Transforms.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/raw_ostream.h"
+#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
+#include "mlir/Dialect/SCF/SCF.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/IR/Block.h"
+#include "mlir/IR/BlockAndValueMapping.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Dominance.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/OperationSupport.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/TypeRange.h"
+#include "mlir/IR/UseDefLists.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/Passes.h"
+#include "mlir/Transforms/RegionUtils.h"
+
+#define DEBUG_TYPE "distribute"
+
+#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
+
+using namespace mlir;
+using namespace mlir::linalg;
+
+constexpr StringLiteral kTiledGenericOpName = "tiled_generic";
+constexpr StringLiteral kTiledGenericYieldOpName = "tiled_generic_yield";
+constexpr StringLiteral kTiledGenericPayloadOpName = "tiled_generic_payload";
+constexpr StringLiteral kTiledGenericPayloadYieldOpName =
+    "tiled_generic_payload_yield";
+
+mlir::linalg::TileAndDistributePattern::TileAndDistributePattern(
+    TileAndDistributeOptions options, LinalgTransformationFilter filter,
+    PatternBenefit benefit)
+    : RewritePattern(benefit, MatchAnyOpTypeTag()),
+      filter(filter),
+      options(options) {}
+
+mlir::linalg::TileAndDistributePattern::TileAndDistributePattern(
+    TileAndDistributeOptions options, StringRef opName, MLIRContext *context,
+    LinalgTransformationFilter filter, PatternBenefit benefit)
+    : RewritePattern(opName, {}, benefit, context),
+      filter(filter),
+      options(options) {}
+
+LogicalResult mlir::linalg::TileAndDistributePattern::matchAndRewrite(
+    Operation *op, PatternRewriter &rewriter) const {
+  LinalgOp linalgOp = dyn_cast<LinalgOp>(op);
+  if (!linalgOp || !linalgOp.hasTensorSemantics()) return failure();
+  if (failed(filter.checkAndNotify(rewriter, linalgOp))) return failure();
+
+  Optional<TileAndDistributedLinalgOp> res =
+      tileAndDistributeLinalgOp(rewriter, op, options);
+  if (!res) return failure();
+  if (res->tiledGenericOp->getNumResults() > 0)
+    rewriter.replaceOp(op, res->tiledGenericOp->getResults());
+  else
+    rewriter.eraseOp(op);
+  filter.replaceLinalgTransformationFilter(rewriter, res->tiledLinalgOp);
+  return success();
+}
+
+static bool isProducedByOneOf(Value v,
+                              llvm::SetVector<Operation *> &operations) {
+  Operation *def = v.getDefiningOp();
+  return def && operations.contains(def);
+}
+
+static bool hasAnyUseOutsideOf(Value v,
+                               llvm::SetVector<Operation *> &operations) {
+  return llvm::any_of(v.getUsers(),
+                      [&](Operation *op) { return !operations.contains(op); });
+}
+
+static LinalgOp outline(PatternRewriter &rewriter, LinalgOp tiledRootOp,
+                        llvm::SetVector<Operation *> &operations,
+                        Operation *tiledGenericPayload) {
+  LinalgOp tiledRootOpClone;
+
+  Region &targetRegion = tiledGenericPayload->getRegion(0);
+  Block *block = &targetRegion.front();
+
+  OpBuilder::InsertionGuard g(rewriter);
+  rewriter.setInsertionPointToStart(block);
+  BlockAndValueMapping bvm;
+  bvm.map(tiledGenericPayload->getOperands(), block->getArguments());
+  llvm::SetVector<Operation *> blockOperations;
+  for (Operation *op : operations) {
+    Operation *cloned = rewriter.clone(*op, bvm);
+    blockOperations.insert(cloned);
+    if (op == tiledRootOp) tiledRootOpClone = cast<LinalgOp>(cloned);
+  }
+
+  llvm::SmallVector<Value> valuesToYield;  //(tiledRootOp->getResults());
+  // Iterate over operations: if any result of any operation is used outside of
+  // operations, it would leak outside of the region from blockOperations.
+  // We need to yield such values outside the region.
+  // TODO: we prob need to determine that set of values outside so we can do a
+  // proper RAUW.
+  for (auto it : llvm::zip(operations, blockOperations))
+    for (OpResult result : std::get<0>(it)->getResults())
+      if (hasAnyUseOutsideOf(result, operations))
+        valuesToYield.push_back(
+            std::get<1>(it)->getResult(result.getResultNumber()));
+
+#if 1
+  rewriter.create<linalg::YieldOp>(blockOperations.front()->getLoc(),
+                                   valuesToYield);
+#else
+  OperationState state(tiledRootOpClone->getLoc(), kTiledGenericYieldOpName);
+  state.addOperands(valuesToYield);
+  state.addTypes(ValueRange{valuesToYield}.getTypes());
+  state.addAttribute(
+      "TODO",
+      rewriter.getStringAttr(
+          "should become linalg.yield_part %partial_tensor, %whole_tensor: "
+          "(partial_tensor_t) -> (whole_tensor_t) where %whole_tensor must "
+          "be "
+          "`subtensor_insert %partial_tensor into ...`"));
+#endif
+
+  return tiledRootOpClone;
+}
+
+static TileAndDistributedLinalgOp buildTiledGenericPayload(
+    PatternRewriter &rewriter, Operation *tiledGenericOp,
+    LinalgOp tiledRootOp) {
+  llvm::SetVector<Operation *> backwardSlice;
+  // Get the backward slice limited by SubTensor ops and properly nested under
+  // tiledGenericOp.
+  getBackwardSlice(tiledRootOp, &backwardSlice, [&](Operation *op) {
+    return !isa<SubTensorOp>(op) && tiledGenericOp->isProperAncestor(op);
+  });
+  backwardSlice.insert(tiledRootOp);
+
+  // Compute used values defined outside of `operations` and use them to clone
+  // in a new block.
+  llvm::SetVector<Value> valuesFromOutside;
+  for (Operation *op : backwardSlice)
+    for (Value v : op->getOperands())
+      if (!isProducedByOneOf(v, backwardSlice)) valuesFromOutside.insert(v);
+
+  OperationState state(tiledRootOp->getLoc(), kTiledGenericPayloadOpName);
+  state.addOperands(valuesFromOutside.getArrayRef());
+  state.addTypes(tiledRootOp->getResultTypes());
+  Region *region = state.addRegion();
+  rewriter.createBlock(region, region->begin(),
+                       ValueRange{valuesFromOutside.getArrayRef()}.getTypes());
+
+  OpBuilder::InsertionGuard g(rewriter);
+  rewriter.setInsertionPointAfter(tiledRootOp);
+  Operation *tiledGenericPayload = rewriter.createOperation(state);
+  LinalgOp tiledRootOpClone =
+      outline(rewriter, tiledRootOp, backwardSlice, tiledGenericPayload);
+  rewriter.replaceOp(tiledRootOp, tiledGenericPayload->getResults());
+
+  // Erase the slice except tiledRootOp which was already replaced.
+  backwardSlice.erase(std::prev(backwardSlice.end()));
+  for (Operation *op : llvm::reverse(backwardSlice))
+    if (op != tiledRootOp) rewriter.eraseOp(op);
+
+  (void)simplifyRegions(tiledGenericPayload->getRegions());
+
+  return TileAndDistributedLinalgOp{tiledGenericOp, tiledGenericPayload,
+                                    tiledRootOpClone};
+}
+
+// Collect values used inside the loop nest but defined outside of it.
+static void getUsedValuesDefinedOutsideOfLoopNest(
+    llvm::SetVector<Operation *> loopNest,
+    llvm::SetVector<Value> &valuesFromAbove) {
+  scf::ForOp outerLoop = cast<scf::ForOp>(loopNest.front());
+  scf::ForOp innerLoop = cast<scf::ForOp>(loopNest.back());
+  innerLoop->walk([&](Operation *op) {
+    for (OpOperand &operand : op->getOpOperands()) {
+      // Values or BBArgs defined by an op outside of the loop nest.
+      if (auto opResult = operand.get().dyn_cast<OpResult>()) {
+        if (!outerLoop->isAncestor(opResult.getDefiningOp()))
+          valuesFromAbove.insert(operand.get());
+        continue;
+      }
+      if (auto bbArg = operand.get().dyn_cast<BlockArgument>()) {
+        if (!outerLoop->isAncestor(bbArg.getOwner()->getParentOp()))
+          valuesFromAbove.insert(operand.get());
+        continue;
+      }
+      // Unexpected cases.
+      llvm_unreachable("unexpected type of Value");
+    }
+  });
+}
+
+static void simplifyTensorGenericOpRegion(PatternRewriter &rewriter,
+                                          Region &region,
+                                          llvm::SetVector<int> keep) {
+  Block *b = &region.front();
+  Block *newBlock = rewriter.createBlock(&region, std::next(region.begin()));
+  SmallVector<Value> argsRepl, newArgs;
+  for (auto en : llvm::enumerate(b->getArguments())) {
+    unsigned idx = en.index();
+    BlockArgument bbarg = en.value();
+    if (keep.contains(idx)) {
+      newBlock->addArgument(bbarg.getType());
+      argsRepl.push_back(newBlock->getArguments().back());
+    } else {
+      argsRepl.push_back(nullptr);
+    }
+  }
+  rewriter.mergeBlocks(b, newBlock, argsRepl);
+}
+
+// TODO: use a map instead of linear scan when it matters.
+template <typename ValueContainerType>
+int lookupIndex(ValueContainerType &vector, Value target) {
+  int pos = 0;
+  for (Value v : vector) {
+    if (target == v) return pos;
+    ++pos;
+  }
+  return -1;
+}
+
+static void canonicalizeTensorGenericOp(PatternRewriter &rewriter,
+                                        Operation *tiledGenericOp) {
+  int64_t numLoops =
+      tiledGenericOp->getAttr("num_loops_attr").cast<IntegerAttr>().getInt();
+  unsigned numControlOperands = 3 * numLoops;
+  unsigned numControlBlockArguments = numLoops;
+
+  Block *block = &tiledGenericOp->getRegion(0).front();
+  OpBuilder::InsertionGuard g(rewriter);
+  rewriter.setInsertionPointToStart(block);
+  llvm::SetVector<Value> canonicalizedOperands;
+  canonicalizedOperands.insert(
+      tiledGenericOp->getOperands().begin(),
+      tiledGenericOp->getOperands().begin() + numControlOperands);
+  // Keep control bbArgs.
+  llvm::SetVector<int> bbArgIdxToKeep;
+  auto rangeNumControlBlockArguments =
+      llvm::seq<int>(0, numControlBlockArguments);
+  bbArgIdxToKeep.insert(rangeNumControlBlockArguments.begin(),
+                        rangeNumControlBlockArguments.end());
+  for (int idx = 0, e = tiledGenericOp->getNumOperands() - numControlOperands;
+       idx < e; ++idx) {
+    OpOperand &operand = tiledGenericOp->getOpOperand(numControlOperands + idx);
+    BlockArgument bbArg = block->getArgument(numControlBlockArguments + idx);
+    // Just drop bbargs without uses.
+    if (bbArg.use_empty()) continue;
+
+    if (canonicalizedOperands.contains(operand.get())) {
+      int duplicateIdx = lookupIndex(canonicalizedOperands, operand.get());
+      LLVM_DEBUG(DBGS() << "Duplicate: " << canonicalizedOperands[duplicateIdx]
+                        << "\n");
+      bbArg.replaceAllUsesWith(block->getArgument(
+          duplicateIdx - numControlOperands + numControlBlockArguments));
+      continue;
+    }
+
+    // Just pull constants in.
+    if (Operation *constantOp = operand.get().getDefiningOp<ConstantOp>()) {
+      LLVM_DEBUG(DBGS() << "Drop: " << *constantOp << "\n");
+      bbArg.replaceAllUsesWith(rewriter.clone(*constantOp)->getResult(0));
+      continue;
+    }
+
+    canonicalizedOperands.insert(operand.get());
+    bbArgIdxToKeep.insert(numControlBlockArguments + idx);
+  }
+
+  simplifyTensorGenericOpRegion(rewriter, tiledGenericOp->getRegion(0),
+                                bbArgIdxToKeep);
+  tiledGenericOp->setOperands(canonicalizedOperands.getArrayRef());
+}
+
+static TileAndDistributedLinalgOp buildTiledGenericOp(
+    PatternRewriter &rewriter, TiledLinalgOp &&tiledLinalgOp) {
+  Location loc = tiledLinalgOp.op->getLoc();
+  SmallVector<Value> lbs, ubs, steps, ivs;
+  for (Operation *loop : tiledLinalgOp.loops) {
+    scf::ForOp forOp = cast<scf::ForOp>(loop);
+    lbs.push_back(forOp.lowerBound());
+    ubs.push_back(forOp.upperBound());
+    steps.push_back(forOp.step());
+    ivs.push_back(forOp.getInductionVar());
+  }
+
+  auto outerLoop = cast<scf::ForOp>(tiledLinalgOp.loops.front());
+  auto innerLoop = cast<scf::ForOp>(tiledLinalgOp.loops.back());
+  llvm::SetVector<Value> valuesFromAbove;
+  llvm::SetVector<Operation *> loopNest(tiledLinalgOp.loops.begin(),
+                                        tiledLinalgOp.loops.end());
+  getUsedValuesDefinedOutsideOfLoopNest(loopNest, valuesFromAbove);
+
+  OperationState state(loc, kTiledGenericOpName);
+  Region *region = state.addRegion();
+  Block *block = new Block();
+  region->push_back(block);
+
+  // Results of TiledGenericOp comprise:
+  //   1. the results of the outermost loop.
+  state.addTypes(tiledLinalgOp.loops.front()->getResultTypes());
+
+  // Operands of TiledGenericOp comprise:
+  //   1. lbs/ubs/steps to reform loops.
+  //   2. valuesFromAbove (TODO: filter out ivs and lbs/ubs/steps).
+  state.addOperands(lbs);
+  state.addOperands(ubs);
+  state.addOperands(steps);
+  // Assume that the outerLoop iter operands match the innerLoop bb iter args.
+  // This is a property guaranteed by tileAndFuse on tensors.
+  // In the future we may want to just directly emit TiledGenericOp to avoid
+  // this assumption.
+  state.addOperands(outerLoop.getIterOperands());
+  state.addOperands(valuesFromAbove.getArrayRef());
+  state.addAttribute("num_loops_attr", rewriter.getI32IntegerAttr(lbs.size()));
+
+  // BBArgs of TiledGenericOp comprise:
+  //   1. indices for each iterator (i.e. the IVs for all loops)
+  //   2. the region iter args of the innermost loop.
+  //   3. valuesFromAbove (TODO: filter out ivs and lbs/ubs/steps).
+  SmallVector<Value> allValues;
+  llvm::append_range(allValues, ivs);
+  // Assume that the outerLoop iter operands match the innerLoop bb iter args.
+  // This is a property guaranteed by tileAndFuse on tensors.
+  // In the future we may want to just directly emit TiledGenericOp to avoid
+  // this assumption.
+  llvm::append_range(allValues, innerLoop.getRegionIterArgs());
+  llvm::append_range(allValues, valuesFromAbove);
+  block->addArguments(ValueRange{allValues}.getTypes());
+
+  // TODO: handle ops in-between [inner, outer] loops (e.g. sink loop-invariants
+  // and/or handle non-hyperrectangular cases).
+  // In general, this will need an extra outlined function.
+  // For best amortization, we will need one such function per dimension.
+  // This is related to directly emitting TiledGenericOp to avoid this
+  // assumption.
+
+  // Propagate bbargs in the block before creating the TiledGeneric op.
+  // We capture more than the innerLoop did and we cannot rely on the 1-1
+  // replacement provided by `mergeBlocks`.
+  for (auto it : llvm::zip(allValues, block->getArguments())) {
+    std::get<0>(it).replaceUsesWithIf(std::get<1>(it), [&](OpOperand &operand) {
+      return innerLoop->isProperAncestor(operand.getOwner());
+    });
+  }
+  Block &innerLoopBlock = innerLoop->getRegion(0).front();
+  assert(llvm::all_of(innerLoopBlock.getArguments(),
+                      [](BlockArgument bbarg) { return bbarg.use_empty(); }));
+  // Steal the ops and replace the loop nest by a new TileGenericOp.
+  block->getOperations().splice(block->end(), innerLoopBlock.getOperations());
+
+  Operation *tiledGenericOp = rewriter.createOperation(state);
+  rewriter.replaceOp(tiledLinalgOp.loops.front(), tiledGenericOp->getResults());
+  Operation *terminatorOp =
+      tiledGenericOp->getRegion(0).front().getTerminator();
+
+  OpBuilder::InsertionGuard g(rewriter);
+  rewriter.setInsertionPoint(terminatorOp);
+#if 1
+  rewriter.replaceOpWithNewOp<linalg::YieldOp>(terminatorOp,
+                                               terminatorOp->getOperands());
+#else
+  OperationState state(loc, kTiledGenericPayloadYieldOpName);
+  state.addOperands(terminatorOp->getOperands());
+  state.addTypes(terminatorOp->getResultTypes());
+  state.addAttribute(
+      "TODO",
+      rewriter.getStringAttr(
+          "should become linalg.yield_part %partial_tensor, %whole_tensor: "
+          "(partial_tensor_t) -> (whole_tensor_t) where %whole_tensor must "
+          "be "
+          "`subtensor_insert %partial_tensor into ...`"));
+#endif
+
+  LLVM_DEBUG(DBGS() << "Pre-cleanup TiledGenericOp\n " << *tiledGenericOp);
+  canonicalizeTensorGenericOp(rewriter, tiledGenericOp);
+  LLVM_DEBUG(DBGS() << "Post-cleanup TiledGenericOp\n " << *tiledGenericOp);
+
+  return buildTiledGenericPayload(rewriter, tiledGenericOp, tiledLinalgOp.op);
+}
+
+Optional<TileAndDistributedLinalgOp> mlir::linalg::tileAndDistributeLinalgOp(
+    PatternRewriter &rewriter, LinalgOp linalgOp,
+    const TileAndDistributeOptions &options) {
+  auto tiledLinalgOp = tileLinalgOp(rewriter, linalgOp, options.tilingOptions);
+  if (!tiledLinalgOp) return llvm::None;
+  linalg::fuseProducerOfTensor(rewriter,
+                               linalgOp.getOutputOpOperands()
+                                   .front()
+                                   .get()
+                                   .getDefiningOp()
+                                   ->getResults()
+                                   .front(),
+                               tiledLinalgOp->op.getOutputOpOperands().front());
+
+  // Consider padding on the fly only if the op has tensor semantics.
+  if (!options.tilingOptions.paddingValueComputationFunction ||
+      !linalgOp.hasTensorSemantics())
+    return buildTiledGenericOp(rewriter, std::move(*tiledLinalgOp));
+
+  // Try to pad on the fly by rewriting tiledLinalgOp->op as a padded op.
+  // TODO: This requires padding and bounding box to symbolic multiples.
+  // (void)rewriteAsPaddedOp(rewriter, *tiledLinalgOp, options.tilingOptions);
+
+  return buildTiledGenericOp(rewriter, std::move(*tiledLinalgOp));
+}
diff --git a/experimental/runners/README.md b/experimental/runners/README.md
new file mode 100644
index 0000000..02777e3
--- /dev/null
+++ b/experimental/runners/README.md
@@ -0,0 +1,5 @@
+# Sandbox for Linalg on Tensors
+
+The purpose of this directory is to expose a minimal end-to-end path for
+targetable codegen with linalg on tensors. This acts as a staging area for
+experimenting with new concepts until they mature and graduate to core.
diff --git a/experimental/runners/Transforms.h b/experimental/runners/Transforms.h
new file mode 100644
index 0000000..37ddf99
--- /dev/null
+++ b/experimental/runners/Transforms.h
@@ -0,0 +1,68 @@
+//===- Transforms.h - Custom Transforms: TileGeneric+Bufferize --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef IREE_EXPERIMENTAL_RUNNERS_TRANSFORMS_H_
+#define IREE_EXPERIMENTAL_RUNNERS_TRANSFORMS_H_
+
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/Linalg/Utils/Utils.h"
+#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/IR/Identifier.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/Bufferize.h"
+
+namespace mlir {
+class BufferizeTypeConverter;
+class FrozenRewritePatternList;
+
+namespace linalg {
+
+/// Specific pass and options to target a tiled and distributed nested linalg
+/// abstraction.
+struct TileAndDistributeOptions {
+  LinalgTilingOptions tilingOptions;
+};
+
+struct TileAndDistributedLinalgOp {
+  Operation *tiledGenericOp;
+  Operation *tiledInnerGenericOp;
+  LinalgOp tiledLinalgOp;
+  TileAndDistributedLinalgOp &operator=(const TileAndDistributedLinalgOp &) =
+      default;
+};
+
+Optional<TileAndDistributedLinalgOp> tileAndDistributeLinalgOp(
+    PatternRewriter &rewriter, LinalgOp op,
+    const TileAndDistributeOptions &options);
+
+struct TileAndDistributePattern : public RewritePattern {
+  /// MatchAnyOpTag-based constructor with a mandatory `filter`.
+  TileAndDistributePattern(TileAndDistributeOptions options,
+                           LinalgTransformationFilter filter,
+                           PatternBenefit benefit = 1);
+  /// Name-based constructor with an optional `filter`.
+  TileAndDistributePattern(
+      TileAndDistributeOptions options, StringRef opName, MLIRContext *context,
+      LinalgTransformationFilter filter = LinalgTransformationFilter(),
+      PatternBenefit benefit = 1);
+  LogicalResult matchAndRewrite(Operation *op,
+                                PatternRewriter &rewriter) const override;
+
+ private:
+  /// LinalgTransformMarker handles special attribute manipulations.
+  LinalgTransformationFilter filter;
+  /// Options.
+  TileAndDistributeOptions options;
+};
+
+}  // namespace linalg
+}  // namespace mlir
+
+#endif  // IREE_EXPERIMENTAL_RUNNERS_TRANSFORMS_H_
diff --git a/experimental/runners/mlir-opt.cpp b/experimental/runners/mlir-opt.cpp
new file mode 100644
index 0000000..feee3ab
--- /dev/null
+++ b/experimental/runners/mlir-opt.cpp
@@ -0,0 +1,55 @@
+//===- mlir-opt.cpp - MLIR Optimizer Driver -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Main entry function for mlir-opt for when built as standalone binary.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "mlir/IR/AsmState.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/InitAllDialects.h"
+#include "mlir/InitAllPasses.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Support/FileUtilities.h"
+#include "mlir/Support/MlirOptMain.h"
+
+using namespace llvm;
+using namespace mlir;
+using namespace mlir::linalg;
+
+// Defined in the runners directory, no public header.
+namespace mlir {
+namespace linalg {
+void registerLinalgComprehensiveBufferizePass();
+void registerLinalgTensorCodegenStrategyPass();
+}  // namespace linalg
+}  // namespace mlir
+
+void registerCustomPasses() {
+  registerLinalgComprehensiveBufferizePass();
+  registerLinalgTensorCodegenStrategyPass();
+}
+
+int main(int argc, char **argv) {
+  llvm::InitLLVM y(argc, argv);
+  registerAllPasses();
+  registerCustomPasses();
+
+  DialectRegistry registry;
+  registerAllDialects(registry);
+
+  return failed(MlirOptMain(argc, argv, "MLIR modular optimizer driver\n",
+                            registry,
+                            /*preloadDialectsInContext=*/false));
+}
diff --git a/experimental/runners/test/matmul_f32_base.mlir b/experimental/runners/test/matmul_f32_base.mlir
new file mode 100644
index 0000000..052ecea
--- /dev/null
+++ b/experimental/runners/test/matmul_f32_base.mlir
@@ -0,0 +1,77 @@
+!elem_type_a = type f32
+!elem_type_b = type f32
+!elem_type_c = type f32
+!row_major_A = type tensor<${M}x${K}x!elem_type_a>
+!row_major_B = type tensor<${K}x${N}x!elem_type_b>
+!row_major_C = type tensor<${M}x${N}x!elem_type_c>
+
+func @init_and_matmul(%a: !row_major_A, %b: !row_major_B, %c: !row_major_C) -> !row_major_C
+// TODO: activate manually for now.
+// attributes { passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]}
+{
+  %v0 = constant 0.0 : !elem_type_c
+  %d = linalg.fill(%c, %v0) : !row_major_C, !elem_type_c -> !row_major_C
+  %e = linalg.matmul ins(%a, %b : !row_major_A, !row_major_B)
+    outs(%d: !row_major_C) -> !row_major_C
+  return %e : !row_major_C
+}
+
+func @print_perf(%iters: index, %total_time: f64) {
+  %c2 = constant 2 : index
+  %cM = constant ${M} : index
+  %cN = constant ${N} : index
+  %cK = constant ${K} : index
+
+  %mn = muli %cM, %cN : index
+  %mnk = muli %mn, %cK : index
+
+  // 2*M*N*K.
+  %flops_per_iter = muli %c2, %mnk : index
+  %flops = muli %iters, %flops_per_iter : index
+  %flops_i64 = index_cast %flops : index to i64
+  %flops_f = sitofp %flops_i64 : i64 to f64
+  %flops_per_s = divf %flops_f, %total_time : f64
+  vector.print %flops_per_s : f64
+
+  return
+}
+
+func @main() {
+  %v0 = constant 0.0 : !elem_type_c
+  %v1 = constant 1.0 : !elem_type_a
+  %v2 = constant 2.0 : !elem_type_b
+
+  %A = linalg.init_tensor [${M}, ${K}] : !row_major_A
+  %B = linalg.init_tensor [${K}, ${N}] : !row_major_B
+  %C = linalg.init_tensor [${M}, ${N}] : !row_major_C
+  %AA = linalg.fill(%A, %v1) : !row_major_A, !elem_type_a -> !row_major_A
+  %BB = linalg.fill(%B, %v2) : !row_major_B, !elem_type_b -> !row_major_B
+  %CC = linalg.fill(%C, %v0) : !row_major_C, !elem_type_c -> !row_major_C
+
+  %c0 = constant 0: index
+  %c1 = constant 1: index
+  %iters = constant ${ITERS}: index
+
+  /// Run and dump performance for matmul.
+  %t_start_matmul = call @rtclock() : () -> f64
+  %res = scf.for %arg0 = %c0 to %iters step %c1 iter_args(%dummy = %CC) -> (!row_major_C) {
+    %r = call @init_and_matmul(%AA, %BB, %CC) : (!row_major_A, !row_major_B, !row_major_C) -> (!row_major_C)
+    scf.yield %r : !row_major_C
+  }
+  %t_end_matmul = call @rtclock() : () -> f64
+  %tmatmul = subf %t_end_matmul, %t_start_matmul: f64
+  call @print_perf(%iters, %tmatmul) : (index, f64) -> ()
+
+  %val = vector.transfer_read %res[%c0, %c0], %v0: !row_major_C, vector<1x1x!elem_type_c>
+  vector.print %val: vector<1x1x!elem_type_c>
+
+  // %unrankedRes = tensor.cast %res : !row_major_C to tensor<*x!elem_type_c>
+  // call @print_memref_f32(%unrankedRes) : (tensor<*x!elem_type_c>) -> ()
+
+  return
+}
+
+func private @rtclock() -> f64
+
+// Abuse reliance on conversions by allowing `tensor<*xf32>`.
+// func private @print_memref_f32(tensor<*xf32>) attributes { llvm.emit_c_interface }
diff --git a/experimental/runners/test/matmul_i8_i8_i32_base.mlir b/experimental/runners/test/matmul_i8_i8_i32_base.mlir
new file mode 100644
index 0000000..91bb54b
--- /dev/null
+++ b/experimental/runners/test/matmul_i8_i8_i32_base.mlir
@@ -0,0 +1,71 @@
+!elem_type_a = type i8
+!elem_type_b = type i8
+!elem_type_c = type i32
+!row_major_A = type tensor<${M}x${K}x!elem_type_a>
+!row_major_B = type tensor<${K}x${N}x!elem_type_b>
+!row_major_C = type tensor<${M}x${N}x!elem_type_c>
+
+func @matmul(%a: !row_major_A, %b: !row_major_B, %c: !row_major_C) -> !row_major_C
+// TODO: activate manually for now.
+// attributes { passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]}
+{
+  %v0 = constant 0 : !elem_type_c
+  %d = linalg.fill(%c, %v0) : !row_major_C, !elem_type_c -> !row_major_C
+  %e = linalg.matmul_i8_i8_i32 ins(%a, %b : !row_major_A, !row_major_B)
+    outs(%d: !row_major_C) -> !row_major_C
+  return %e : !row_major_C
+}
+
+func @print_perf(%iters: index, %total_time: f64) {
+  %c2 = constant 2 : index
+  %cM = constant ${M} : index
+  %cN = constant ${N} : index
+  %cK = constant ${K} : index
+
+  %mn = muli %cM, %cN : index
+  %mnk = muli %mn, %cK : index
+
+  // 2*M*N*K.
+  %flops_per_iter = muli %c2, %mnk : index
+  %flops = muli %iters, %flops_per_iter : index
+  %flops_i64 = index_cast %flops : index to i64
+  %flops_f = sitofp %flops_i64 : i64 to f64
+  %flops_per_s = divf %flops_f, %total_time : f64
+  vector.print %flops_per_s : f64
+
+  return
+}
+
+func @main() {
+  %v0 = constant 0 : !elem_type_c
+  %v1 = constant 1 : !elem_type_a
+
+  %A = linalg.init_tensor [${M}, ${K}] : !row_major_A
+  %B = linalg.init_tensor [${K}, ${N}] : !row_major_B
+  %C = linalg.init_tensor [${M}, ${N}] : !row_major_C
+  %AA = linalg.fill(%A, %v1) : !row_major_A, !elem_type_a -> !row_major_A
+  %BB = linalg.fill(%B, %v1) : !row_major_B, !elem_type_b -> !row_major_B
+  %CC = linalg.fill(%C, %v0) : !row_major_C, !elem_type_c -> !row_major_C
+
+  %c0 = constant 0: index
+  %c1 = constant 1: index
+  %iters = constant ${ITERS}: index
+
+  /// Run and dump performance for matmul.
+  %t_start_matmul = call @rtclock() : () -> f64
+  %res = scf.for %arg0 = %c0 to %iters step %c1 iter_args(%dummy = %CC) -> (!row_major_C) {
+    %r = call @matmul(%AA, %BB, %CC) : (!row_major_A, !row_major_B, !row_major_C) -> (!row_major_C)
+    scf.yield %r : !row_major_C
+  }
+  %t_end_matmul = call @rtclock() : () -> f64
+  %tmatmul = subf %t_end_matmul, %t_start_matmul: f64
+  call @print_perf(%iters, %tmatmul) : (index, f64) -> ()
+
+  %val = vector.transfer_read %res[%c0, %c0], %v0: !row_major_C, vector<1x1x!elem_type_c>
+  // CHECK: 64
+  vector.print %val: vector<1x1x!elem_type_c>
+
+  return
+}
+
+func private @rtclock() -> f64
diff --git a/experimental/runners/test/test_matmul_f32_000.mlir b/experimental/runners/test/test_matmul_f32_000.mlir
new file mode 100644
index 0000000..88baaa6
--- /dev/null
+++ b/experimental/runners/test/test_matmul_f32_000.mlir
@@ -0,0 +1,32 @@
+// RUN: export M=128 && export N=128 && export K=128 && export ITERS=10 && \
+// RUN: cat %p/matmul_f32_base.mlir | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g' |\
+
+// RUN: mlir-proto-opt -linalg-comprehensive-bufferize-inplace |\
+// RUN: tee | FileCheck %s
+
+// CHECK-LABEL: func @init_and_matmul(
+//  CHECK-SAME:       %[[A:[0-9a-zA-Z]+]]: memref<
+//  CHECK-SAME:       %[[B:[0-9a-zA-Z]+]]: memref<
+//  CHECK-SAME:       %[[C:[0-9a-zA-Z]+]]: memref<
+//       CHECK:   constant 0.0
+
+// Analysis kicks in, we can write in %[[C]] and no spurious alloc/copies are inserted.
+//  CHECK-NEXT:   linalg.fill(%[[C]], %{{.*}}) : memref<128x128xf32>, f32
+//  CHECK-NEXT:   linalg.matmul ins(%[[A]], %[[B]] : memref<128x128xf32>, memref<128x128xf32>) outs(%[[C]] : memref<128x128xf32>)
+//  CHECK-NEXT:   return
+
+// CHECK-LABEL: func @main(
+//   CHECK-DAG:   %[[f0:.*]] = constant 0.0
+//   CHECK-DAG:   %[[f1:.*]] = constant 1.0
+//   CHECK-DAG:   %[[f2:.*]] = constant 2.0
+//   CHECK-DAG:   alloc() : memref<128x128xf32>
+//   CHECK-DAG:   alloc() : memref<128x128xf32>
+//   CHECK-DAG:   alloc() : memref<128x128xf32>
+//   CHECK-DAG:   linalg.fill(%[[A:.*]], %[[f1]]) : memref<128x128xf32>, f32
+//   CHECK-DAG:   linalg.fill(%[[B:.*]], %[[f2]]) : memref<128x128xf32>, f32
+//   CHECK-DAG:   linalg.fill(%[[C:.*]], %[[f0]]) : memref<128x128xf32>, f32
+//       CHECK:   call @rtclock() : () -> f64
+//  CHECK-NEXT:   scf.for %{{.*}} {
+//  CHECK-NEXT:     call @init_and_matmul(%[[A]], %[[B]], %[[C]]) : (memref<128x128xf32>, memref<128x128xf32>, memref<128x128xf32>) -> ()
+//  CHECK-NEXT:   }
+//  CHECK-NEXT:   call @rtclock() : () -> f64
diff --git a/experimental/runners/test/test_matmul_f32_000_exec.mlir b/experimental/runners/test/test_matmul_f32_000_exec.mlir
new file mode 100644
index 0000000..dea750c
--- /dev/null
+++ b/experimental/runners/test/test_matmul_f32_000_exec.mlir
@@ -0,0 +1,12 @@
+// RUN: export M=128 && export N=128 && export K=128 && export ITERS=1 &&\
+// RUN: cat %p/matmul_f32_base.mlir | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g' |\
+
+// RUN: mlir-proto-opt -linalg-comprehensive-bufferize-inplace |\
+// RUN: mlir-opt -convert-vector-to-scf -lower-affine -convert-linalg-to-loops |\
+// RUN: mlir-opt -canonicalize -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
+
+// RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \
+// RUN:   -shared-libs=%iree_runners_test_dir/libruntime-support%shlibext |\
+// RUN: FileCheck %s
+
+// CHECK: ( ( 256 ) )
diff --git a/experimental/runners/test/test_matmul_f32_001.mlir b/experimental/runners/test/test_matmul_f32_001.mlir
new file mode 100644
index 0000000..14e28c9
--- /dev/null
+++ b/experimental/runners/test/test_matmul_f32_001.mlir
@@ -0,0 +1,32 @@
+// RUN: export M=128 && export N=128 && export K=128 && export ITERS=10 &&\
+// RUN: cat %p/matmul_f32_base.mlir | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g' |\
+
+// RUN: mlir-proto-opt -linalg-tensor-codegen-strategy="anchor-func=init_and_matmul anchor-op=linalg.matmul tile-sizes=4,8,16 vectorize vector-contract-lowering=false" |\
+// RUN: mlir-proto-opt -linalg-comprehensive-bufferize-inplace |\
+// RUN: tee | FileCheck %s
+
+// CHECK-LABEL: func @init_and_matmul(
+//  CHECK-SAME:       %[[A:[0-9a-zA-Z]+]]: memref<
+//  CHECK-SAME:       %[[B:[0-9a-zA-Z]+]]: memref<
+//  CHECK-SAME:       %[[C:[0-9a-zA-Z]+]]: memref<
+//       CHECK:   constant 0.0
+//   CHECK-NOT:   alloc
+//       CHECK:   linalg.fill(%[[C]], %{{.*}}) : memref<128x128xf32>, f32
+//   CHECK-NOT:   copy
+//       CHECK:   scf.for %[[I:.*]] =
+//       CHECK:     scf.for %[[J:.*]] =
+//       CHECK:       %[[SVC:.*]] = subview %[[C]]{{.*}} : memref<128x128xf32> to memref<4x8xf32
+//       CHECK:       %[[VC:.*]] = vector.transfer_read %[[SVC]]{{.*}}{masked = [false, false]} : memref<4x8xf32{{.*}}>, vector<4x8xf32>
+//       CHECK:       scf.for %[[K:.*]] = {{.*}} iter_args(%{{.*}} = %[[VC]]) -> (vector<4x8xf32>)
+//       CHECK:         %[[SVA:.*]] = subview %[[A]][%[[I]], %[[K]]] [4, 16] [1, 1] : memref<128x128xf32> to memref<4x16xf32
+//       CHECK:         %[[SVB:.*]] = subview %[[B]][%[[K]], %[[J]]] [16, 8] [1, 1] : memref<128x128xf32> to memref<16x8xf32
+//       CHECK:         vector.transfer_read %[[SVA]]{{.*}} {masked = [false, false]} : memref<4x16xf32{{.*}}>, vector<4x16xf32>
+//       CHECK:         vector.transfer_read %[[SVB]]{{.*}}, %cst {masked = [false, false]} : memref<16x8xf32{{.*}}>, vector<16x8xf32>
+//       CHECK:         vector.contract
+//       CHECK:         scf.yield %{{.*}} : vector<4x8xf32>
+//       CHECK:       }
+//       CHECK:       vector.transfer_write %{{.*}}, %[[SVC]]{{.*}}{masked = [false, false]} : vector<4x8xf32>, memref<4x8xf32
+//   CHECK-NOT:       copy
+//       CHECK:     }
+//       CHECK:   }
+//   CHECK-NOT:   copy
diff --git a/experimental/runners/test/test_matmul_f32_001_exec.mlir b/experimental/runners/test/test_matmul_f32_001_exec.mlir
new file mode 100644
index 0000000..3af463c
--- /dev/null
+++ b/experimental/runners/test/test_matmul_f32_001_exec.mlir
@@ -0,0 +1,13 @@
+// RUN: export M=128 && export N=128 && export K=128 && export ITERS=10 &&\
+// RUN: cat %p/matmul_f32_base.mlir | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g' |\
+
+// RUN: mlir-proto-opt -linalg-tensor-codegen-strategy="anchor-func=init_and_matmul anchor-op=linalg.matmul tile-sizes=4,8,16 vectorize vector-contract-lowering=false" |\
+// RUN: mlir-proto-opt -linalg-comprehensive-bufferize-inplace |\
+// RUN: mlir-opt -convert-vector-to-scf -lower-affine -convert-linalg-to-loops |\
+// RUN: mlir-opt -canonicalize -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
+
+// RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \
+// RUN:   -shared-libs=%iree_runners_test_dir/libruntime-support%shlibext |\
+// RUN: tee | FileCheck %s
+
+// CHECK: ( ( 256 ) )
diff --git a/experimental/runners/test/test_matmul_f32_002.mlir b/experimental/runners/test/test_matmul_f32_002.mlir
new file mode 100644
index 0000000..18a0ba9
--- /dev/null
+++ b/experimental/runners/test/test_matmul_f32_002.mlir
@@ -0,0 +1,49 @@
+// RUN: export M=128 && export N=128 && export K=128 && export ITERS=10 &&\
+// RUN: cat %p/matmul_f32_base.mlir | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g' |\
+
+// RUN: mlir-proto-opt -linalg-tensor-codegen-strategy="anchor-func=init_and_matmul anchor-op=linalg.matmul tile-sizes=4,8,16 pad hoist-padding=1" |\
+// RUN: mlir-proto-opt -linalg-tensor-codegen-strategy="anchor-func=init_and_matmul anchor-op=linalg.matmul vectorize vector-contract-lowering=false vectorize-padding" |\
+// RUN: mlir-proto-opt -linalg-comprehensive-bufferize-inplace |\
+// RUN: tee | FileCheck %s
+
+// CHECK-LABEL: func @init_and_matmul(
+//  CHECK-SAME:       %[[A:[0-9a-zA-Z]+]]: memref<
+//  CHECK-SAME:       %[[B:[0-9a-zA-Z]+]]: memref<
+//  CHECK-SAME:       %[[C:[0-9a-zA-Z]+]]: memref<
+//       CHECK:   constant 0.0
+//   CHECK-NOT:   alloc
+//       CHECK:   linalg.fill(%[[C]], %{{.*}}) : memref<128x128xf32>, f32
+//   CHECK-DAG:   %[[PACKED_A:.*]] = alloc() : memref<8x4x16xf32>
+//   CHECK-DAG:   %[[PACKED_B:.*]] = alloc() : memref<8x16x8xf32>
+//   CHECK-NOT:   copy
+//       CHECK:   scf.for %[[I:.*]] =
+//       CHECK:     scf.for %[[J:.*]] =
+//       CHECK:       scf.for %[[K1:.*]] =
+//       CHECK:         %[[PACKED_IDX_B:.*]] = affine.apply
+//       CHECK:         subview %[[B]][%[[K1]], %[[J]]] [16, 8] [1, 1] : memref<128x128xf32> to memref<16x8xf32
+//       CHECK:         subview %[[PACKED_B]][%[[PACKED_IDX_B]], 0, 0] [1, 16, 8] [1, 1, 1] : memref<8x16x8xf32> to memref<16x8xf32
+//       CHECK:         linalg.copy
+//       CHECK:       scf.for %[[K2:.*]] =
+//       CHECK:         %[[PACKED_IDX_A:.*]] = affine.apply
+//       CHECK:         subview %[[A]][%[[I]], %[[K2]]] [4, 16] [1, 1] : memref<128x128xf32> to memref<4x16xf32
+//       CHECK:         subview %[[PACKED_A]][%[[PACKED_IDX_A]], 0, 0] [1, 4, 16] [1, 1, 1] : memref<8x4x16xf32> to memref<4x16xf32
+//       CHECK:         linalg.copy
+//       CHECK:       %[[SVC:.*]] = subview %[[C]]{{.*}} : memref<128x128xf32> to memref<4x8xf32
+//       CHECK:       %[[VC:.*]] = vector.transfer_read %[[SVC]]{{.*}}{masked = [false, false]} : memref<4x8xf32{{.*}}>, vector<4x8xf32>
+//       CHECK:       scf.for %[[K:.*]] = {{.*}} iter_args(%{{.*}} = %[[VC]]) -> (vector<4x8xf32>)
+//       CHECK:         %[[PACKED_IDX:.*]] = affine.apply
+//       CHECK:         %[[SVA:.*]] = subview %[[PACKED_A]][%[[PACKED_IDX]], 0, 0] [1, 4, 16] [1, 1, 1] : memref<8x4x16xf32> to memref<4x16xf32
+//       CHECK:         %[[SVB:.*]] = subview %[[PACKED_B]][%[[PACKED_IDX]], 0, 0] [1, 16, 8] [1, 1, 1] : memref<8x16x8xf32> to memref<16x8xf32
+//       CHECK:         vector.transfer_read %[[SVA]]{{.*}} {masked = [false, false]} : memref<4x16xf32{{.*}}>, vector<4x16xf32>
+//       CHECK:         vector.transfer_read %[[SVB]]{{.*}}, %cst {masked = [false, false]} : memref<16x8xf32{{.*}}>, vector<16x8xf32>
+//       CHECK:         vector.contract
+//       CHECK:         scf.yield %{{.*}} : vector<4x8xf32>
+//   CHECK-NOT:         copy
+//       CHECK:       }
+//       CHECK:       vector.transfer_write %{{.*}}, %[[SVC]]{{.*}}{masked = [false, false]} : vector<4x8xf32>, memref<4x8xf32
+//   CHECK-NOT:       copy
+//       CHECK:     }
+//       CHECK:   }
+//   CHECK-NOT:   copy
+//   CHECK-DAG:   dealloc %[[PACKED_A]]
+//   CHECK-DAG:   dealloc %[[PACKED_B]]
diff --git a/experimental/runners/test/test_matmul_f32_002_exec.mlir b/experimental/runners/test/test_matmul_f32_002_exec.mlir
new file mode 100644
index 0000000..a3ab909
--- /dev/null
+++ b/experimental/runners/test/test_matmul_f32_002_exec.mlir
@@ -0,0 +1,14 @@
+// RUN: export M=128 && export N=128 && export K=128 && export ITERS=10 &&\
+// RUN: cat %p/matmul_f32_base.mlir | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g' |\
+
+// RUN: mlir-proto-opt -linalg-tensor-codegen-strategy="anchor-func=init_and_matmul anchor-op=linalg.matmul tile-sizes=4,8,16 pad hoist-padding=1" |\
+// RUN: mlir-proto-opt -linalg-tensor-codegen-strategy="anchor-func=init_and_matmul anchor-op=linalg.matmul vectorize vectorize-padding" |\
+// RUN: mlir-proto-opt -linalg-comprehensive-bufferize-inplace |\
+// RUN: mlir-opt -convert-vector-to-scf -lower-affine -convert-linalg-to-loops |\
+// RUN: mlir-opt -canonicalize -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm | \
+
+// RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \
+// RUN:   -shared-libs=%iree_runners_test_dir/libruntime-support%shlibext |\
+// RUN: tee | FileCheck %s
+
+// CHECK: ( ( 256 ) )
diff --git a/experimental/runners/test/test_matmul_f32_003.mlir b/experimental/runners/test/test_matmul_f32_003.mlir
new file mode 100644
index 0000000..b05370e
--- /dev/null
+++ b/experimental/runners/test/test_matmul_f32_003.mlir
@@ -0,0 +1,52 @@
+// RUN: export M=128 && export N=128 && export K=128 && export ITERS=10 &&\
+// RUN: cat %p/matmul_f32_base.mlir | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g' |\
+
+// RUN: mlir-proto-opt -linalg-tensor-codegen-strategy="anchor-func=init_and_matmul anchor-op=linalg.matmul tile-sizes=4,8,16 pad hoist-padding=2" |\
+// RUN: mlir-proto-opt -linalg-tensor-codegen-strategy="anchor-func=init_and_matmul anchor-op=linalg.matmul vectorize vector-contract-lowering=false vectorize-padding" |\
+// RUN: mlir-proto-opt -linalg-comprehensive-bufferize-inplace |\
+// RUN: tee | FileCheck %s
+
+// CHECK-LABEL: func @init_and_matmul(
+//  CHECK-SAME:       %[[A:[0-9a-zA-Z]+]]: memref<
+//  CHECK-SAME:       %[[B:[0-9a-zA-Z]+]]: memref<
+//  CHECK-SAME:       %[[C:[0-9a-zA-Z]+]]: memref<
+//       CHECK:   constant 0.0
+//   CHECK-NOT:   alloc
+//       CHECK:   linalg.fill(%[[C]], %{{.*}}) : memref<128x128xf32>, f32
+//   CHECK-DAG:   %[[PACKED_A:.*]] = alloc() : memref<8x4x16xf32>
+//   CHECK-DAG:   %[[PACKED_B:.*]] = alloc() : memref<8x16x16x8xf32>
+//   CHECK-NOT:   copy
+//       CHECK:   scf.for %[[I:.*]] =
+//       CHECK:     scf.for %[[J1:.*]] =
+//       CHECK:       %[[PACKED_IDX_B_J1:.*]] = affine.apply
+//       CHECK:       scf.for %[[K1:.*]] =
+//       CHECK:         %[[PACKED_IDX_B_K1:.*]] = affine.apply
+//       CHECK:         subview %[[B]][%[[K1]], %[[J1]]] [16, 8] [1, 1] : memref<128x128xf32> to memref<16x8xf32
+//       CHECK:         subview %[[PACKED_B]][%[[PACKED_IDX_B_J1]], %[[PACKED_IDX_B_K1]], 0, 0] [1, 1, 16, 8] [1, 1, 1, 1] : memref<8x16x16x8xf32> to memref<16x8xf32
+//       CHECK:         linalg.copy
+//       CHECK:     scf.for %[[K2:.*]] =
+//       CHECK:       %[[PACKED_IDX_A:.*]] = affine.apply
+//       CHECK:       subview %[[A]][%[[I]], %[[K2]]] [4, 16] [1, 1] : memref<128x128xf32> to memref<4x16xf32
+//       CHECK:       subview %[[PACKED_A]][%[[PACKED_IDX_A]], 0, 0] [1, 4, 16] [1, 1, 1] : memref<8x4x16xf32> to memref<4x16xf32
+//       CHECK:       linalg.copy
+//       CHECK:     scf.for %[[J:.*]] =
+//       CHECK:       %[[PACKED_IDX_J:.*]] = affine.apply
+//       CHECK:       %[[SVC:.*]] = subview %[[C]]{{.*}} : memref<128x128xf32> to memref<4x8xf32
+//       CHECK:       %[[VC:.*]] = vector.transfer_read %[[SVC]]{{.*}}{masked = [false, false]} : memref<4x8xf32{{.*}}>, vector<4x8xf32>
+//       CHECK:       scf.for %[[K:.*]] = {{.*}} iter_args(%{{.*}} = %[[VC]]) -> (vector<4x8xf32>)
+//       CHECK:         %[[PACKED_IDX_K:.*]] = affine.apply
+//       CHECK:         %[[SVA:.*]] = subview %[[PACKED_A]][%[[PACKED_IDX_K]], 0, 0] [1, 4, 16] [1, 1, 1] : memref<8x4x16xf32> to memref<4x16xf32
+//       CHECK:         %[[SVB:.*]] = subview %[[PACKED_B]][%[[PACKED_IDX_K]], %[[PACKED_IDX_J]], 0, 0] [1, 1, 16, 8] [1, 1, 1, 1] : memref<8x16x16x8xf32> to memref<16x8xf32
+//       CHECK:         vector.transfer_read %[[SVA]]{{.*}} {masked = [false, false]} : memref<4x16xf32{{.*}}>, vector<4x16xf32>
+//       CHECK:         vector.transfer_read %[[SVB]]{{.*}}, %cst {masked = [false, false]} : memref<16x8xf32{{.*}}>, vector<16x8xf32>
+//       CHECK:         %[[RES:.*]] = vector.contract
+//       CHECK:         scf.yield %[[RES]] : vector<4x8xf32>
+//   CHECK-NOT:         copy
+//       CHECK:       }
+//       CHECK:       vector.transfer_write %{{.*}}, %[[SVC]]{{.*}}{masked = [false, false]} : vector<4x8xf32>, memref<4x8xf32
+//   CHECK-NOT:       copy
+//       CHECK:     }
+//       CHECK:   }
+//   CHECK-NOT:   copy
+//   CHECK-DAG:   dealloc %[[PACKED_A]]
+//   CHECK-DAG:   dealloc %[[PACKED_B]]
diff --git a/experimental/runners/test/test_matmul_f32_003_exec.mlir b/experimental/runners/test/test_matmul_f32_003_exec.mlir
new file mode 100644
index 0000000..05f7f75
--- /dev/null
+++ b/experimental/runners/test/test_matmul_f32_003_exec.mlir
@@ -0,0 +1,15 @@
+// RUN: export M=128 && export N=128 && export K=128 && export ITERS=1 &&\
+// RUN: cat %p/matmul_f32_base.mlir | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g' |\
+
+// Hoist-padding=2 has some transposition bug, use only square stuff for now
+// RUN: mlir-proto-opt -linalg-tensor-codegen-strategy="anchor-func=init_and_matmul anchor-op=linalg.matmul tile-sizes=8,8,8 pad hoist-padding=2" |\
+// RUN: mlir-proto-opt -linalg-tensor-codegen-strategy="anchor-func=init_and_matmul vectorize-padding" |\
+// RUN: mlir-proto-opt -linalg-comprehensive-bufferize-inplace |\
+// RUN: mlir-opt -convert-vector-to-scf -lower-affine -convert-linalg-to-loops |\
+// RUN: mlir-opt -canonicalize -convert-scf-to-std -convert-vector-to-llvm -convert-std-to-llvm -snapshot-op-locations='filename=/tmp/intermediate_llvm.mlir'| \
+
+// RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \
+// RUN:   -shared-libs=%iree_runners_test_dir/libruntime-support%shlibext |\
+// RUN: tee | FileCheck %s
+
+// CHECK: ( ( 256 ) )
diff --git a/iree/compiler/Dialect/HAL/Target/LLVM/LLVMTargetOptions.cpp b/iree/compiler/Dialect/HAL/Target/LLVM/LLVMTargetOptions.cpp
index b6c18e0..c8ad6d5 100644
--- a/iree/compiler/Dialect/HAL/Target/LLVM/LLVMTargetOptions.cpp
+++ b/iree/compiler/Dialect/HAL/Target/LLVM/LLVMTargetOptions.cpp
@@ -74,6 +74,20 @@
       llvm::cl::desc("LLVM target machine CPU features; use 'host' for your "
                      "host native CPU"),
       llvm::cl::init(""));
+
+  static llvm::cl::opt<bool> llvmLoopInterleaving(
+      "iree-llvm-loop-interleaving", llvm::cl::init(false),
+      llvm::cl::desc("Enable LLVM loop interleaving opt"));
+  static llvm::cl::opt<bool> llvmLoopVectorization(
+      "iree-llvm-loop-vectorization", llvm::cl::init(true),
+      llvm::cl::desc("Enable LLVM loop vectorization opt"));
+  static llvm::cl::opt<bool> llvmLoopUnrolling(
+      "iree-llvm-loop-unrolling", llvm::cl::init(false),
+      llvm::cl::desc("Enable LLVM loop unrolling opt"));
+  static llvm::cl::opt<bool> llvmSLPVectorization(
+      "iree-llvm-slp-vectorization", llvm::cl::init(false),
+      llvm::cl::desc("Enable LLVM SLP Vectorization opt"));
+
   llvmTargetOptions.targetTriple = clTargetTriple;
   if (clTargetCPU != "host") {
     llvmTargetOptions.targetCPU = clTargetCPU;
@@ -82,6 +96,15 @@
     llvmTargetOptions.targetCPUFeatures = clTargetCPUFeatures;
   }
 
+  // LLVM opt options.
+  llvmTargetOptions.pipelineTuningOptions.LoopInterleaving =
+      llvmLoopInterleaving;
+  llvmTargetOptions.pipelineTuningOptions.LoopVectorization =
+      llvmLoopVectorization;
+  llvmTargetOptions.pipelineTuningOptions.LoopUnrolling = llvmLoopUnrolling;
+  llvmTargetOptions.pipelineTuningOptions.SLPVectorization =
+      llvmSLPVectorization;
+
   static llvm::cl::opt<SanitizerKind> clSanitizerKind(
       "iree-llvm-sanitize", llvm::cl::desc("Apply LLVM sanitize feature"),
       llvm::cl::init(SanitizerKind::kNone),
diff --git a/iree/test/e2e/tosa_ops/transpose.mlir b/iree/test/e2e/tosa_ops/transpose.mlir
new file mode 100644
index 0000000..19c492b
--- /dev/null
+++ b/iree/test/e2e/tosa_ops/transpose.mlir
@@ -0,0 +1,7 @@
+func @test_transpose() attributes { iree.module.export } {
+  %0 = constant dense<[[[0, 1, 2], [3, 4, 5]]]> : tensor<1x2x3xi32>
+  %1 = constant dense<[1, 2, 0]> : tensor<3xi32>
+  %2 = "tosa.transpose"(%0, %1) : (tensor<1x2x3xi32>, tensor<3xi32>) -> (tensor<2x3x1xi32>)
+  check.expect_eq_const(%2, dense<[[[0], [1], [2]], [[3], [4], [5]]]> : tensor<2x3x1xi32>) : tensor<2x3x1xi32>
+  return
+}
diff --git a/iree/test/e2e/xla_ops/BUILD b/iree/test/e2e/xla_ops/BUILD
index 2db32d2..f3e3f3a 100644
--- a/iree/test/e2e/xla_ops/BUILD
+++ b/iree/test/e2e/xla_ops/BUILD
@@ -256,6 +256,32 @@
     target_backend = "vulkan-spirv",
 )
 
+iree_check_single_backend_test_suite(
+    name = "check_linalg_on_tensors_cuda",
+    srcs = [
+        "add.mlir",
+        "broadcast.mlir",
+        "broadcast_add.mlir",
+        "broadcast_in_dim.mlir",
+        "clamp.mlir",
+        "compare.mlir",
+        "constant.mlir",
+        "divide.mlir",
+        "iota.mlir",
+        "maximum.mlir",
+        "minimum.mlir",
+        "multiply.mlir",
+        "negate.mlir",
+        "select.mlir",
+        "subtract.mlir",
+    ],
+    compiler_flags = [
+        "-iree-flow-dispatch-linalg-on-tensors",
+    ],
+    driver = "cuda",
+    target_backend = "cuda",
+)
+
 test_suite(
     name = "check",
     tests = [
diff --git a/iree/test/e2e/xla_ops/CMakeLists.txt b/iree/test/e2e/xla_ops/CMakeLists.txt
index 6d2b15e..e1c0dc0 100644
--- a/iree/test/e2e/xla_ops/CMakeLists.txt
+++ b/iree/test/e2e/xla_ops/CMakeLists.txt
@@ -221,4 +221,31 @@
     "-iree-codegen-spirv-experimental-linalg-on-tensors"
 )
 
+iree_check_single_backend_test_suite(
+  NAME
+    check_linalg_on_tensors_cuda
+  SRCS
+    "add.mlir"
+    "broadcast.mlir"
+    "broadcast_add.mlir"
+    "broadcast_in_dim.mlir"
+    "clamp.mlir"
+    "compare.mlir"
+    "constant.mlir"
+    "divide.mlir"
+    "iota.mlir"
+    "maximum.mlir"
+    "minimum.mlir"
+    "multiply.mlir"
+    "negate.mlir"
+    "select.mlir"
+    "subtract.mlir"
+  TARGET_BACKEND
+    "cuda"
+  DRIVER
+    "cuda"
+  COMPILER_FLAGS
+    "-iree-flow-dispatch-linalg-on-tensors"
+)
+
 ### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/third_party/mlir-emitc b/third_party/mlir-emitc
index 4e501d8..b57346c 160000
--- a/third_party/mlir-emitc
+++ b/third_party/mlir-emitc
@@ -1 +1 @@
-Subproject commit 4e501d8c6e2d834999301a2492adefe5ddbdc0cb
+Subproject commit b57346cdc50296c0c498aaf20b116b0ff23cb68c