Merge pull request #8268 from google/benvanik-task-wait-source
Moving task system waits to a dedicated wait thread.
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
new file mode 100644
index 0000000..18fbf8a
--- /dev/null
+++ b/.github/CODEOWNERS
@@ -0,0 +1,12 @@
+# Code owners for the IREE GitHub repository.
+# Refer to https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners
+# for syntax of this file.
+
+# Global code owners
+
+# Code owners for individual components/directories
+/iree/compiler/Codegen/ @MaheshRavishankar
+/iree/compiler/Codegen/LLVMCPU/ @MaheshRavishankar @hanhanW
+/iree/compiler/Codegen/LLVMGPU/ @MaheshRavishankar @ThomasRaoux
+/iree/compiler/Codegen/Sandbox/ @MaheshRavishankar @hanhanW
+/iree/compiler/Codegen/SPIRV/ @MaheshRavishankar @antiagainst
diff --git a/.github/workflows/android_tflite_oneshot_build.yml b/.github/workflows/android_tflite_oneshot_build.yml
index b641e44..5a396e4 100644
--- a/.github/workflows/android_tflite_oneshot_build.yml
+++ b/.github/workflows/android_tflite_oneshot_build.yml
@@ -9,7 +9,7 @@
build_android_with_docker:
runs-on: ubuntu-latest
env:
- ANDROID_CONTAINER: "gcr.io/iree-oss/gradle-android@sha256:d36f3d172c3304e557cd3d9a13bae4590cb2a8d19d229b10f126423e7314a413"
+ ANDROID_CONTAINER: "gcr.io/iree-oss/gradle-android@sha256:bbe2b4576f2a6d8df9d84c42e0fbd3c644edf9512ab3d8fe14ffd887f7ff526c"
steps:
- uses: actions/checkout@v2
with:
diff --git a/.github/workflows/build_package.yml b/.github/workflows/build_package.yml
index 8ee72ae..c2b9573 100644
--- a/.github/workflows/build_package.yml
+++ b/.github/workflows/build_package.yml
@@ -51,10 +51,10 @@
# Windows packages.
- os: windows-2019
build_package: py-compiler-pkg
- experimental: false
+ experimental: true
- os: windows-2019
build_package: py-runtime-pkg
- experimental: false
+ experimental: true
# Macos packages.
- os: macos-latest
@@ -66,11 +66,10 @@
env:
CIBW_BUILD_VERBOSITY: 1
- # TODO: in-tree-build can be removed once pip 21.3 is released.
# Note that on Linux, we run under docker with an altered path.
# Note that on Windows, we need to configure the compiler api project to
# put its CMake build directory on a short path to avoid path overflow.
- CIBW_ENVIRONMENT_LINUX: "REPO_DIR=/project/main_checkout BINDIST_DIR=/output CMAKE_GENERATOR=Ninja"
+ CIBW_ENVIRONMENT_LINUX: "REPO_DIR=/project/main_checkout BINDIST_DIR=/output CMAKE_GENERATOR=Ninja IREE_TARGET_BACKEND_CUDA=ON"
CIBW_ENVIRONMENT_MACOS: "REPO_DIR=${{ github.workspace }}/main_checkout CMAKE_GENERATOR=Ninja"
CIBW_ENVIRONMENT_WINDOWS: "REPO_DIR='${{ github.workspace }}\\main_checkout' CMAKE_GENERATOR=Ninja IREE_COMPILER_API_CMAKE_BUILD_DIR=D:/b"
@@ -191,14 +190,9 @@
if: "matrix.build_package == 'py-compiler-pkg'"
shell: bash
run: |
- # Need a newer pip to use in-tree-build. *but* the old default pip
- # can't accept the in-tree-build feature while upgrading itself.
- # Facepalm. Since there is no way to customize CIBW, we just manually
- # install a pre-release build of 21.3, in which the in-tree-build
- # feature is enabled by default, obviating the need to configure it.
- # After the 2021 October release, this can just be changed to install
- # pip >= 21.3 from official sources. I'm sorry about this folks.
- export CIBW_BEFORE_BUILD="python -m pip install --upgrade pip==21.3dev0 -f https://github.com/stellaraccident/pip/releases/tag/21.3dev20210925"
+      # In pip 21.3, in-tree builds became the default and only way to
+      # build, so make sure the installed pip is at or past that version.
+ export CIBW_BEFORE_BUILD="python -m pip install --upgrade pip>=21.3"
python -m cibuildwheel --output-dir bindist ./main_checkout/llvm-external-projects/iree-compiler-api
# Compiler tools wheels are not python version specific, so just build
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3211b9e..695c3d3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,7 +4,7 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-cmake_minimum_required(VERSION 3.16.3...3.21)
+cmake_minimum_required(VERSION 3.17...3.22)
# LLVM requires CMP0116 for tblgen: https://reviews.llvm.org/D101083
# CMP0116: Ninja generators transform `DEPFILE`s from `add_custom_command()`
@@ -74,12 +74,8 @@
option(IREE_HAL_DRIVER_DEFAULTS "Sets the default value for all runtime HAL drivers" ON)
option(IREE_TARGET_BACKEND_DEFAULTS "Sets the default value for all compiler target backends" ${IREE_BUILD_COMPILER})
-# CUDA is not natively supported on Android or Apple platforms.
-if(ANDROID OR APPLE)
- set(IREE_HAL_DRIVER_CUDA_DEFAULT OFF)
-else()
- set(IREE_HAL_DRIVER_CUDA_DEFAULT ${IREE_HAL_DRIVER_DEFAULTS})
-endif()
+# CUDA support must be explicitly enabled.
+set(IREE_HAL_DRIVER_CUDA_DEFAULT OFF)
# Vulkan is not natively supported on Apple platforms.
# Metal should generally be used instead, though MoltenVK may also work.
@@ -96,7 +92,7 @@
option(IREE_HAL_DRIVER_VMVX_SYNC "Enables the 'vmvx-sync' runtime HAL driver" ${IREE_HAL_DRIVER_DEFAULTS})
option(IREE_HAL_DRIVER_VULKAN "Enables the 'vulkan' runtime HAL driver" ${IREE_HAL_DRIVER_VULKAN_DEFAULT})
-cmake_dependent_option(IREE_TARGET_BACKEND_CUDA "Enables the 'cuda' compiler target backend" ON ${IREE_BUILD_COMPILER} OFF)
+cmake_dependent_option(IREE_TARGET_BACKEND_CUDA "Enables the 'cuda' compiler target backend" OFF ${IREE_BUILD_COMPILER} OFF)
cmake_dependent_option(IREE_TARGET_BACKEND_DYLIB_LLVM_AOT "Enables the 'dylib-llvm-aot' compiler target backend" ON ${IREE_BUILD_COMPILER} OFF)
cmake_dependent_option(IREE_TARGET_BACKEND_METAL_SPIRV "Enables the 'metal-spirv' compiler target backend" ON ${IREE_BUILD_COMPILER} OFF)
cmake_dependent_option(IREE_TARGET_BACKEND_ROCM "Enables the 'rocm' compiler target backend" ON ${IREE_BUILD_COMPILER} OFF)
@@ -324,6 +320,67 @@
endif()
#-------------------------------------------------------------------------------
+# CUDA configuration for both the compiler and runtime.
+# We do this at the top level so that we can fail fast and make global
+# decisions that affect both compiler and runtime. It also helps with error
+# messaging to do this all in one place, since we can provide very targeted
+# advice.
+#-------------------------------------------------------------------------------
+
+set(IREE_CUDA_LIBDEVICE_PATH "" CACHE STRING "Absolute path to an appropriate libdevice.*.bc (needed to build the IREE cuda compiler target)")
+
+# If any CUDA features are being built, try to locate a CUDA SDK. We will fall
+# back to this as needed for specific features.
+if(IREE_TARGET_BACKEND_CUDA OR IREE_HAL_DRIVER_CUDA)
+ find_package(CUDAToolkit)
+endif()
+
+# If an explicit libdevice file was not specified, and the compiler backend
+# is being built, probe for one.
+if(IREE_TARGET_BACKEND_CUDA)
+ if(CUDAToolkit_FOUND AND CUDAToolkit_LIBRARY_ROOT)
+ # Note that the variable CUDAToolkit_LIBRARY_ROOT keys off of the presence
+    # of version.txt, which was changed to version.json in recent releases,
+    # thwarting the search.
+ set(IREE_CUDA_LIBDEVICE_PATH "${CUDAToolkit_LIBRARY_ROOT}/nvvm/libdevice/libdevice.10.bc")
+ elseif(CUDAToolkit_FOUND AND CUDAToolkit_BIN_DIR)
+ # Back-track from the bin dir as a fallback.
+ set(IREE_CUDA_LIBDEVICE_PATH "${CUDAToolkit_BIN_DIR}/../nvvm/libdevice/libdevice.10.bc")
+ elseif(CUDAToolkit_ROOT)
+    # Sometimes the CUDA toolkit isn't detected... because, you know,
+    # computers are hard and such. In this case, if the user went to the
+    # trouble to tell us where it is, we have enough information.
+ set(IREE_CUDA_LIBDEVICE_PATH "${CUDAToolkit_ROOT}/nvvm/libdevice/libdevice.10.bc")
+ else()
+ message(FATAL_ERROR "Building with IREE_TARGET_BACKEND_CUDA requires either a CUDA SDK (consult CMake docs for your version: https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html) or an explicit path to libdevice (set with -DIREE_CUDA_LIBDEVICE_PATH=/path/to/libdevice.10.bc)")
+ endif()
+
+ if(EXISTS "${IREE_CUDA_LIBDEVICE_PATH}")
+ message(STATUS "Using CUDA libdevice: ${IREE_CUDA_LIBDEVICE_PATH}")
+ else()
+ message(SEND_ERROR "Cannot find CUDA libdevice file (${IREE_CUDA_LIBDEVICE_PATH}). Either configure your CUDA SDK such that it can be found or specify explicitly via -DIREE_CUDA_LIBDEVICE_PATH=/path/to/libdevice.10.bc")
+ endif()
+endif()
+
+if(IREE_HAL_DRIVER_CUDA)
+ if(CUDAToolkit_FOUND)
+ message(STATUS "Using CUDA INCLUDE_DIRS from found SDK: ${CUDAToolkit_INCLUDE_DIRS}")
+ elseif(CUDAToolkit_ROOT)
+ # See note above about computers being hard.
+ # We make minimal use of CUDA for the runtime and really just need cuda.h
+ # presently. So let's make a guess at that.
+ set(CUDAToolkit_INCLUDE_DIRS "${CUDAToolkit_ROOT}/include")
+ if(EXISTS "${CUDAToolkit_INCLUDE_DIRS}/cuda.h")
+ message(STATUS "Using CUDA INCLUDE_DIRS from CUDAToolkit_ROOT: ${CUDAToolkit_INCLUDE_DIRS}")
+ else()
+ message(SEND_ERROR "Using explicitly specified CUDAToolkit_ROOT, could not find cuda.h at: ${CUDAToolkit_INCLUDE_DIRS}")
+ endif()
+ else()
+ message(SEND_ERROR "Cannot build IREE runtime CUDA components (-DIREE_HAL_DRIVER_CUDA=ON) because a CUDA SDK was not found. Consult CMake docs for your version: https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html")
+ endif()
+endif()
+
+#-------------------------------------------------------------------------------
# MLIR/LLVM Dependency
#-------------------------------------------------------------------------------
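With CUDA now opt-in, builds that want it must enable it explicitly. A minimal sketch (not part of this change, and with an illustrative libdevice path) of a configure invocation, in the same subprocess style that build_dist.py uses later in this diff:

```python
# Illustrative only: configure a build with the now-opt-in CUDA options.
# IREE_CUDA_LIBDEVICE_PATH is only needed when FindCUDAToolkit cannot
# locate libdevice on its own; the path below is an example.
import subprocess

subprocess.run([
    "cmake", "-S", ".", "-B", "build",
    "-DIREE_TARGET_BACKEND_CUDA=ON",
    "-DIREE_HAL_DRIVER_CUDA=ON",
    "-DIREE_CUDA_LIBDEVICE_PATH=/usr/local/cuda/nvvm/libdevice/libdevice.10.bc",
], check=True)
```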
diff --git a/bindings/python/iree/runtime/CMakeLists.txt b/bindings/python/iree/runtime/CMakeLists.txt
index fcae8c8..f2d62ed 100644
--- a/bindings/python/iree/runtime/CMakeLists.txt
+++ b/bindings/python/iree/runtime/CMakeLists.txt
@@ -34,9 +34,14 @@
"vm.cc"
UNIX_LINKER_SCRIPT
"unix_version.lds"
+ DEFINES
+ # Pybind code seems to be incompatible with C++ allocation tracing
+    # hooks, so disable them.
+ IREE_TRACING_HOOK_CPP_NEW_DELETE=0
DEPS
iree::base
iree::base::cc
+ iree::base::internal::flags
iree::hal
iree::hal::drivers
iree::modules::hal
@@ -50,6 +55,7 @@
SRCS
"__init__.py"
"array_interop.py"
+ "flags.py"
"function.py"
"system_api.py"
"tracing.py"
@@ -100,6 +106,13 @@
iree_py_test(
NAME
+ flags_test
+ SRCS
+ "flags_test.py"
+)
+
+iree_py_test(
+ NAME
function_test
SRCS
"function_test.py"
diff --git a/bindings/python/iree/runtime/__init__.py b/bindings/python/iree/runtime/__init__.py
index 9d2e1f0..e0d8643 100644
--- a/bindings/python/iree/runtime/__init__.py
+++ b/bindings/python/iree/runtime/__init__.py
@@ -43,3 +43,5 @@
from .system_api import *
from .function import *
from .tracing import *
+
+from . import flags
diff --git a/bindings/python/iree/runtime/flags.py b/bindings/python/iree/runtime/flags.py
new file mode 100644
index 0000000..a7b1020
--- /dev/null
+++ b/bindings/python/iree/runtime/flags.py
@@ -0,0 +1,12 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from .binding import parse_flags
+
+# When enabled, performs additional function input validation checks. In the
+# event of errors, this will yield nicer error messages but comes with a
+# runtime cost.
+FUNCTION_INPUT_VALIDATION = True
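The new module is intentionally small: `parse_flags` forwards to the native flag parser, and `FUNCTION_INPUT_VALIDATION` gates the extra marshalling checks in function.py. A hedged usage sketch (not part of this diff):

```python
from iree import runtime as rt

# Forward flags to the native IREE flag parser; the binding synthesizes
# argv[0] ("python"), and unrecognized flags raise ValueError.
rt.flags.parse_flags("--iree_v=1")

# Consulted by the argument marshalling in function.py (see below).
print(rt.flags.FUNCTION_INPUT_VALIDATION)
```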
diff --git a/bindings/python/iree/runtime/flags_test.py b/bindings/python/iree/runtime/flags_test.py
new file mode 100644
index 0000000..886176a
--- /dev/null
+++ b/bindings/python/iree/runtime/flags_test.py
@@ -0,0 +1,24 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from iree import runtime as rt
+import numpy as np
+import unittest
+
+
+class FlagsTest(unittest.TestCase):
+
+ def testParse(self):
+    # We always have the logging verbose level available, so use it.
+ rt.flags.parse_flags("--iree_v=1")
+
+ def testParseError(self):
+ with self.assertRaisesRegex(ValueError, "flag 'barbar' not recognized"):
+ rt.flags.parse_flags("--barbar")
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/bindings/python/iree/runtime/function.py b/bindings/python/iree/runtime/function.py
index 57739b8..d0ebd79 100644
--- a/bindings/python/iree/runtime/function.py
+++ b/bindings/python/iree/runtime/function.py
@@ -28,6 +28,8 @@
map_dtype_to_element_type,
DeviceArray,
)
+from .flags import FUNCTION_INPUT_VALIDATION
__all__ = [
"FunctionInvoker",
@@ -140,7 +142,7 @@
_merge_python_sequence_to_vm(inv, arg_list, args, self._arg_descs)
if call_trace:
call_trace.add_vm_list(arg_list, "args")
- self._vm_context.invoke(self._vm_function, arg_list, ret_list)
+ self._invoke(arg_list, ret_list)
if call_trace:
call_trace.add_vm_list(ret_list, "results")
@@ -162,6 +164,10 @@
if call_trace:
call_trace.end_call()
+ # Break out invoke so it shows up in profiles.
+ def _invoke(self, arg_list, ret_list):
+ self._vm_context.invoke(self._vm_function, arg_list, ret_list)
+
def _parse_abi_dict(self, vm_function: VmFunction):
reflection = vm_function.reflection
abi_json = reflection.get("iree.abi")
@@ -284,7 +290,7 @@
def _ndarray_to_vm(inv: Invocation, t: VmVariantList, x, desc):
# Validate and implicit conversion against type descriptor.
- if desc is not None:
+ if FUNCTION_INPUT_VALIDATION and desc is not None:
desc_type = desc[0]
if desc_type != "ndarray":
_raise_argument_error(inv, f"passed an ndarray but expected {desc_type}")
@@ -343,7 +349,7 @@
def _ndarray_like_to_vm(inv: Invocation, t: VmVariantList, x, desc):
if isinstance(x, HalBufferView):
return _buffer_view_to_vm(inv, t, x, desc)
- return _ndarray_to_vm(inv, t, np.asarray(x), desc)
+ return _ndarray_to_vm(inv, t, x, desc)
class _MissingArgument:
@@ -532,7 +538,7 @@
# For dynamic mode, just assume we have the right arity.
if descs is None:
descs = [None] * len(py_list)
- else:
+ elif FUNCTION_INPUT_VALIDATION:
len_py_list = sum([1 for x in py_list if x is not MissingArgument])
if len(py_list) != len_py_list:
_raise_argument_error(
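Since the VM call is now routed through the dedicated `_invoke` method, it appears as its own frame in Python profiles. A sketch of observing that, assuming `invoker` and `arg_array` are set up as in function_test.py below:

```python
import cProfile

# FunctionInvoker._invoke shows up as a distinct frame in the stats,
# separating VM execution time from argument marshalling overhead.
cProfile.runctx("invoker(arg_array)", globals(), locals())
```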
diff --git a/bindings/python/iree/runtime/function_test.py b/bindings/python/iree/runtime/function_test.py
index b194db5..0a013cc 100644
--- a/bindings/python/iree/runtime/function_test.py
+++ b/bindings/python/iree/runtime/function_test.py
@@ -265,6 +265,32 @@
self.assertEqual("<VmVariantList(1): [HalBufferView(2:0x20000011)]>",
repr(invoked_arg_list))
+ def testDeviceArrayArg(self):
+ # Note that since the device array is set up to disallow implicit host
+ # transfers, this also verifies that no accidental/automatic transfers
+ # are done as part of marshalling the array to the function.
+ arg_array = rt.asdevicearray(self.device,
+ np.asarray([1, 0], dtype=np.int32),
+ implicit_host_transfer=False)
+
+ invoked_arg_list = None
+
+ def invoke(arg_list, ret_list):
+ nonlocal invoked_arg_list
+ invoked_arg_list = arg_list
+
+ vm_context = MockVmContext(invoke)
+ vm_function = MockVmFunction(reflection={
+ "iree.abi": json.dumps({
+ "a": [["ndarray", "i32", 1, 2]],
+ "r": [],
+ })
+ })
+ invoker = FunctionInvoker(vm_context, self.device, vm_function, tracer=None)
+ result = invoker(arg_array)
+ self.assertEqual("<VmVariantList(1): [HalBufferView(2:0x20000011)]>",
+ repr(invoked_arg_list))
+
def testBufferViewArg(self):
arg_buffer_view = self.device.allocator.allocate_buffer_copy(
memory_type=IMPLICIT_BUFFER_ARG_MEMORY_TYPE,
diff --git a/bindings/python/iree/runtime/initialize_module.cc b/bindings/python/iree/runtime/initialize_module.cc
index 334668a..b48586c 100644
--- a/bindings/python/iree/runtime/initialize_module.cc
+++ b/bindings/python/iree/runtime/initialize_module.cc
@@ -8,6 +8,7 @@
#include "bindings/python/iree/runtime/hal.h"
#include "bindings/python/iree/runtime/status_utils.h"
#include "bindings/python/iree/runtime/vm.h"
+#include "iree/base/internal/flags.h"
#include "iree/base/status_cc.h"
#include "iree/hal/drivers/init.h"
@@ -21,6 +22,26 @@
m.doc() = "IREE Binding Backend Helpers";
SetupHalBindings(m);
SetupVmBindings(m);
+
+ m.def("parse_flags", [](py::args py_flags) {
+ std::vector<std::string> alloced_flags;
+ alloced_flags.push_back("python");
+ for (auto &py_flag : py_flags) {
+ alloced_flags.push_back(py::cast<std::string>(py_flag));
+ }
+
+ // Must build pointer vector after filling so pointers are stable.
+ std::vector<char *> flag_ptrs;
+ for (auto &alloced_flag : alloced_flags) {
+ flag_ptrs.push_back(const_cast<char *>(alloced_flag.c_str()));
+ }
+
+ char **argv = &flag_ptrs[0];
+ int argc = flag_ptrs.size();
+ CheckApiStatus(
+ iree_flags_parse(IREE_FLAGS_PARSE_MODE_DEFAULT, &argc, &argv),
+ "Error parsing flags");
+ });
}
} // namespace python
diff --git a/build_tools/bazel/iree.bazelrc b/build_tools/bazel/iree.bazelrc
index dd6bc43..ab5d786 100644
--- a/build_tools/bazel/iree.bazelrc
+++ b/build_tools/bazel/iree.bazelrc
@@ -262,7 +262,7 @@
# specific docker container the CI Bazel builds are run in. The image URL is
# included for clarity and so that this reference is automatically updated by
# manage_images.py
-build:remote_cache_bazel_ci --host_platform_remote_properties_override='properties:{name:"cache-silo-key" value:"gcr.io/iree-oss/frontends-swiftshader@sha256:e70c6524cea980b9c6077a76e0ac90464c47eff4f27b6c74b8eaab3ff1fb35fc"}'
+build:remote_cache_bazel_ci --host_platform_remote_properties_override='properties:{name:"cache-silo-key" value:"gcr.io/iree-oss/frontends-swiftshader@sha256:4327fef0ed53c8658f052d145f2aa8ce395c29f48fb075df78638fd3ce3603e5"}'
###############################################################################
# Configuration for uploading build results to Result Store UI
diff --git a/build_tools/bazel/iree_check_test.bzl b/build_tools/bazel/iree_check_test.bzl
index b34ef02..a72fe41 100644
--- a/build_tools/bazel/iree_check_test.bzl
+++ b/build_tools/bazel/iree_check_test.bzl
@@ -127,6 +127,9 @@
tests = []
for src in srcs:
+ # CUDA backend/driver not supported by Bazel build.
+ if target_backend == "cuda" or driver == "cuda":
+ continue
test_name = "_".join([name, src])
iree_check_test(
name = test_name,
@@ -198,6 +201,9 @@
# could just create a test suite. The latter seems simpler and more readable.
tests = []
for backend, driver in target_backends_and_drivers:
+ # CUDA backend/driver not supported by Bazel build.
+ if backend == "cuda" or driver == "cuda":
+ continue
suite_name = "_".join([name, backend, driver])
iree_check_single_backend_test_suite(
name = suite_name,
diff --git a/build_tools/bazel/iree_trace_runner_test.bzl b/build_tools/bazel/iree_trace_runner_test.bzl
index c0e6b79..a6fb7d0 100644
--- a/build_tools/bazel/iree_trace_runner_test.bzl
+++ b/build_tools/bazel/iree_trace_runner_test.bzl
@@ -207,7 +207,11 @@
opt_tool: Defaulting to iree-opt. Tool used to preprocess the source files
if opt_flags is specified.
opt_flags: If specified, source files are preprocessed with opt_tool with
- these flags.
+ these flags. The special string "#pass_options_variant#" is replaced
+ with the empty string. That may in the future be changed to some
+ automatically determined pass options for each entry in
+ target_cpu_features_variants, as is currently done in the CMake
+ build.
trace_runner: trace-runner program to run.
timeout: timeout for the generated tests.
target_cpu_features_variants: list of target cpu features variants. Currently unimplemented, so each
@@ -221,6 +225,7 @@
fail("Entry %s in target_cpu_features_variants: unimplemented" % target_cpu_features)
tests = []
+ processed_opt_flags = [flag.replace("#pass_options_variant#", "") for flag in opt_flags]
for backend, driver in target_backends_and_drivers:
suite_entry_name = "_".join([name, backend, driver])
iree_single_backend_generated_trace_runner_test(
@@ -233,7 +238,7 @@
compiler_flags = compiler_flags,
runner_args = runner_args,
opt_tool = opt_tool,
- opt_flags = opt_flags,
+ opt_flags = processed_opt_flags,
tags = tags,
timeout = timeout,
**kwargs
diff --git a/build_tools/buildkite/cmake/android/arm64-v8a/benchmark2.yml b/build_tools/buildkite/cmake/android/arm64-v8a/benchmark2.yml
index 788f04f..4e1eb94 100644
--- a/build_tools/buildkite/cmake/android/arm64-v8a/benchmark2.yml
+++ b/build_tools/buildkite/cmake/android/arm64-v8a/benchmark2.yml
@@ -9,7 +9,7 @@
steps:
- label: "Build"
commands:
- - "docker run --user=$(id -u):$(id -g) --volume=\\${HOME?}:\\${HOME?} --volume=/etc/passwd:/etc/passwd:ro --volume=/etc/group:/etc/group:ro --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/frontends@sha256:e7faf90e0f77ad6f9236df88e7af26447644c0860610c2cb7a3202c2b961795f build_tools/cmake/build_android_benchmark.sh"
+ - "docker run --user=$(id -u):$(id -g) --volume=\\${HOME?}:\\${HOME?} --volume=/etc/passwd:/etc/passwd:ro --volume=/etc/group:/etc/group:ro --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/frontends@sha256:c720dc8788a49c5a3cbd6b947cd37360ac9160dac043170dd513c8eafb3c5818 build_tools/cmake/build_android_benchmark.sh"
- "tar --exclude='*.tar.gz' --exclude='*.tgz' --exclude='*.mlir' --exclude='*.tflite' -czvf benchmark-suites-${BUILDKITE_BUILD_NUMBER}.tgz build-host/benchmark_suites"
- "tar -czvf iree-android-tools-${BUILDKITE_BUILD_NUMBER}.tgz build-android/iree/tools/iree-benchmark-module build-android-trace/iree/tools/iree-benchmark-module"
if: "build.pull_request.id == null || (build.pull_request.labels includes 'buildkite:benchmark')"
diff --git a/build_tools/buildkite/cmake/android/arm64-v8a/pipeline.yml b/build_tools/buildkite/cmake/android/arm64-v8a/pipeline.yml
index 975b0a4..6447ff6 100644
--- a/build_tools/buildkite/cmake/android/arm64-v8a/pipeline.yml
+++ b/build_tools/buildkite/cmake/android/arm64-v8a/pipeline.yml
@@ -8,7 +8,7 @@
- label: "build"
commands:
- "git submodule sync && git submodule update --init --jobs 8 --depth 1"
- - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/android@sha256:22368d0c424160c68109ff33772b84b4633304d315709de959353db8ffd81c52 build_tools/cmake/build_android.sh arm64-v8a"
+ - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/android@sha256:a98f4d2299257c311ecf182e3ecbdf24f228ce55d22ed9f6b229a92c96a16c86 build_tools/cmake/build_android.sh arm64-v8a"
- "tar --exclude='*.o' --exclude='*.a' -czvf build-artifacts.tgz build-android"
agents:
- "queue=build"
diff --git a/build_tools/buildkite/cmake/build_configurations.yml b/build_tools/buildkite/cmake/build_configurations.yml
index c36be78..354c92b 100644
--- a/build_tools/buildkite/cmake/build_configurations.yml
+++ b/build_tools/buildkite/cmake/build_configurations.yml
@@ -8,7 +8,7 @@
- label: ":zap: Build with tracing enabled"
commands:
- "git submodule sync && git submodule update --init --jobs 8 --depth 1"
- - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/base@sha256:ec10501f22c04c2199c68df5f6782946224fb63568e2a1701ddb3c4928a42bb5 ./build_tools/cmake/build_tracing.sh"
+ - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/base@sha256:ea076afa9ec854ed75d0608a08d64e4bc5e1ad2933d497912b4034553d4a56ac ./build_tools/cmake/build_tracing.sh"
env:
IREE_DOCKER_WORKDIR: "/usr/src/github/iree"
agents:
@@ -17,7 +17,7 @@
- label: ":hammer_and_wrench: Build the runtime only"
commands:
- "git submodule sync && git submodule update --init --jobs 8 --depth 1"
- - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/base@sha256:ec10501f22c04c2199c68df5f6782946224fb63568e2a1701ddb3c4928a42bb5 ./build_tools/cmake/build_runtime.sh"
+ - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/base@sha256:ea076afa9ec854ed75d0608a08d64e4bc5e1ad2933d497912b4034553d4a56ac ./build_tools/cmake/build_runtime.sh"
env:
IREE_DOCKER_WORKDIR: "/usr/src/github/iree"
agents:
@@ -26,7 +26,7 @@
- label: ":pinching_hand: Build the size-optimized runtime only"
commands:
- "git submodule sync && git submodule update --init --jobs 8 --depth 1"
- - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/base@sha256:ec10501f22c04c2199c68df5f6782946224fb63568e2a1701ddb3c4928a42bb5 ./build_tools/cmake/build_runtime_small.sh"
+ - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/base@sha256:ea076afa9ec854ed75d0608a08d64e4bc5e1ad2933d497912b4034553d4a56ac ./build_tools/cmake/build_runtime_small.sh"
env:
IREE_DOCKER_WORKDIR: "/usr/src/github/iree"
agents:
@@ -36,7 +36,7 @@
key: "build-gcc"
commands:
- "git submodule sync && git submodule update --init --jobs 8 --depth 1"
- - "docker run --env CC=/usr/bin/gcc-9 --env CXX=/usr/bin/g++-9 --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/base@sha256:ec10501f22c04c2199c68df5f6782946224fb63568e2a1701ddb3c4928a42bb5 ./build_tools/cmake/clean_build.sh"
+ - "docker run --env CC=/usr/bin/gcc-9 --env CXX=/usr/bin/g++-9 --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/base@sha256:ea076afa9ec854ed75d0608a08d64e4bc5e1ad2933d497912b4034553d4a56ac ./build_tools/cmake/clean_build.sh"
env:
IREE_DOCKER_WORKDIR: "/usr/src/github/iree"
agents:
@@ -46,7 +46,7 @@
key: "build-host-install"
commands:
- "git submodule sync && git submodule update --init --jobs 8 --depth 1"
- - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/base@sha256:ec10501f22c04c2199c68df5f6782946224fb63568e2a1701ddb3c4928a42bb5 ./build_tools/cmake/build_host_install.sh"
+ - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/base@sha256:ea076afa9ec854ed75d0608a08d64e4bc5e1ad2933d497912b4034553d4a56ac ./build_tools/cmake/build_host_install.sh"
- "tar -czvf build-artifacts.tgz build-host/install"
artifact_paths: "build-artifacts.tgz"
env:
@@ -60,7 +60,7 @@
- "buildkite-agent artifact download --step build-host-install build-artifacts.tgz ./"
- "tar xzf build-artifacts.tgz"
- "git submodule update --init --jobs 8 --depth 1"
- - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/emscripten@sha256:60daf8bdb36e7e120687197d07f41aebd4965805dbb9f2978622cc36f0a90840 ./build_tools/cmake/build_runtime_emscripten.sh"
+ - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/emscripten@sha256:2d99a86203c4de572a8928892cbfcd41d8e16ca432c53ad01ed33d2fb991ad78 ./build_tools/cmake/build_runtime_emscripten.sh"
env:
IREE_DOCKER_WORKDIR: "/usr/src/github/iree"
agents:
diff --git a/build_tools/buildkite/samples.yml b/build_tools/buildkite/samples.yml
index 3096d35..d4ba6d4 100644
--- a/build_tools/buildkite/samples.yml
+++ b/build_tools/buildkite/samples.yml
@@ -7,7 +7,7 @@
steps:
- label: "Test Colab notebooks"
commands:
- - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/samples@sha256:6cb69f51c69e6cf8ad57db04bbf80227bd3efd9b014111a702e81b716309f486 python3 colab/test_notebooks.py"
+ - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/samples@sha256:c57ab23ad4b1e1241be1228030a7d4a6b729b9ed5ee1c126d1e9098c1dc84902 python3 colab/test_notebooks.py"
env:
IREE_DOCKER_WORKDIR: "/usr/src/github/iree"
agents:
@@ -15,7 +15,7 @@
- label: "Test Samples"
commands:
- - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/samples@sha256:6cb69f51c69e6cf8ad57db04bbf80227bd3efd9b014111a702e81b716309f486 build_tools/testing/test_samples.sh"
+ - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/samples@sha256:c57ab23ad4b1e1241be1228030a7d4a6b729b9ed5ee1c126d1e9098c1dc84902 build_tools/testing/test_samples.sh"
env:
IREE_DOCKER_WORKDIR: "/usr/src/github/iree"
agents:
diff --git a/build_tools/cmake/benchmark_compilation_flagfile.in b/build_tools/cmake/benchmark_compilation_flagfile.in
new file mode 100644
index 0000000..cf37cf9
--- /dev/null
+++ b/build_tools/cmake/benchmark_compilation_flagfile.in
@@ -0,0 +1 @@
+@IREE_BENCHMARK_COMPILATION_FLAGS@
diff --git a/build_tools/cmake/clean_build.sh b/build_tools/cmake/clean_build.sh
index 543412b..0af2e4e 100755
--- a/build_tools/cmake/clean_build.sh
+++ b/build_tools/cmake/clean_build.sh
@@ -17,4 +17,4 @@
cd ${ROOT_DIR?}
rm -rf build/
-./build_tools/cmake/rebuild.sh
+./build_tools/cmake/rebuild.sh "$@"
diff --git a/build_tools/cmake/iree_benchmark_suite.cmake b/build_tools/cmake/iree_benchmark_suite.cmake
index fb6e845..4e9e4e8 100644
--- a/build_tools/cmake/iree_benchmark_suite.cmake
+++ b/build_tools/cmake/iree_benchmark_suite.cmake
@@ -283,6 +283,14 @@
DEPENDS "${_TOOL_FILE}"
)
+ # Generate a flagfile containing command-line options used to compile the
+ # generated artifacts.
+ set(_COMPOPT_FILE "${_RUN_SPEC_DIR}/compilation_flagfile")
+ string(REPLACE ";" "\n" IREE_BENCHMARK_COMPILATION_FLAGS "${_TRANSLATION_ARGS}")
+ configure_file(
+ ${PROJECT_SOURCE_DIR}/build_tools/cmake/benchmark_compilation_flagfile.in
+ ${_COMPOPT_FILE})
+
# Mark dependency so that we have one target to drive them all.
add_dependencies(iree-benchmark-suites
"${_FLAGFILE_GEN_TARGET_NAME}"
diff --git a/build_tools/cmake/iree_bytecode_module.cmake b/build_tools/cmake/iree_bytecode_module.cmake
index f918702..125670c 100644
--- a/build_tools/cmake/iree_bytecode_module.cmake
+++ b/build_tools/cmake/iree_bytecode_module.cmake
@@ -91,6 +91,7 @@
DEPENDS
${_OPT_TOOL_EXECUTABLE}
${_RULE_SRC}
+ VERBATIM
)
else()
# OPT_FLAGS was not specified, so are not using the OPT_TOOL.
@@ -123,6 +124,7 @@
${_TRANSLATE_TOOL_EXECUTABLE}
${_EMBEDDED_LINKER_TOOL_EXECUTABLE}
${_TRANSLATE_SRC}
+ VERBATIM
)
if(_RULE_TESTONLY)
diff --git a/build_tools/cmake/iree_c_embed_data.cmake b/build_tools/cmake/iree_c_embed_data.cmake
index 659e256..34a6aa4 100644
--- a/build_tools/cmake/iree_c_embed_data.cmake
+++ b/build_tools/cmake/iree_c_embed_data.cmake
@@ -12,7 +12,8 @@
#
# Parameters:
# NAME: Name of target (see Note).
-# SRCS: List of source files to embed.
+# SRCS: List of source files to embed (non-absolute paths will be resolved
+#        relative to CMAKE_CURRENT_SOURCE_DIR).
# GENERATED_SRCS: List of generated source files to embed.
# C_FILE_OUTPUT: The C implementation file to output.
# H_FILE_OUTPUT: The H header file to output.
@@ -57,7 +58,11 @@
endif()
foreach(SRC ${_RULE_SRCS})
- list(APPEND _ARGS "${CMAKE_CURRENT_SOURCE_DIR}/${SRC}")
+ if(IS_ABSOLUTE "${SRC}")
+ list(APPEND _ARGS "${SRC}")
+ else()
+ list(APPEND _ARGS "${CMAKE_CURRENT_SOURCE_DIR}/${SRC}")
+ endif()
endforeach(SRC)
foreach(SRC ${_RULE_GENERATED_SRCS})
list(APPEND _ARGS "${SRC}")
diff --git a/build_tools/cmake/iree_check_test.cmake b/build_tools/cmake/iree_check_test.cmake
index 97cf9fd..9492f42 100644
--- a/build_tools/cmake/iree_check_test.cmake
+++ b/build_tools/cmake/iree_check_test.cmake
@@ -294,25 +294,32 @@
# Helper function parsing a string occurring as an entry in TARGET_CPU_FEATURES_VARIANTS.
#
# This function has 4 output-params: variables that it sets with PARENT_SCOPE:
-# _ENABLED, _TARGET_CPU_FEATURES, _TARGET_CPU_FEATURES_SUFFIX.
+# _ENABLED, _TARGET_CPU_FEATURES, _TARGET_CPU_FEATURES_SUFFIX, _TARGET_PASS_OPTIONS.
#
# "default" is handled specially. _ENABLED is always set to "TRUE" and
-# _TARGET_CPU_FEATURES and _TARGET_CPU_FEATURES_SUFFIX are both set to the
-# empty string.
+# _TARGET_CPU_FEATURES, _TARGET_CPU_FEATURES_SUFFIX and _TARGET_PASS_OPTIONS are set to
+# the empty string.
#
# Other values are parsed as "arch:features", the parsed arch is matched with
# `CMAKE_SYSTEM_PROCESSOR`, `_ENABLED` is set to "TRUE" if and only if they
-# match, and `_TARGET_CPU_FEATURES_SUFFIX` is set to a string based on the
-# features that is appropriate to include in a CMake target or test name. More
-# than one target cpu feature is currently unsupported.
-# aarch64:+dotprod -> _TARGET_CPU_FEATURES="+dotprod", _TARGET_CPU_FEATURES_SUFFIX="_dotprod"
-# default -> _TARGET_CPU_FEATURES="", _TARGET_CPU_FEATURES_SUFFIX="", ENABLED="TRUE"
+# match, `_TARGET_CPU_FEATURES_SUFFIX` is set to a string based on the
+# features that is appropriate to include in a CMake target or test name, and
+# `_TARGET_PASS_OPTIONS` is formatted to be passed as options to certain passes that
+# expect "arch=<arch> features=<+feature1,...>".
+# More than one target cpu feature is currently unsupported.
+#
+# aarch64:+dotprod -> _ENABLED="TRUE" if the target architecture is aarch64,
+# _TARGET_CPU_FEATURES="+dotprod",
+# _TARGET_CPU_FEATURES_SUFFIX="_dotprod",
+# _TARGET_PASS_OPTIONS="arch=aarch64 features=+dotprod"
+# default -> _ENABLED="TRUE" unconditionally, other output strings are "".
function(process_target_cpu_features _INPUT_TARGET_CPU_FEATURES _ENABLED
- _TARGET_CPU_FEATURES _TARGET_CPU_FEATURES_SUFFIX)
+ _TARGET_CPU_FEATURES _TARGET_CPU_FEATURES_SUFFIX _TARGET_PASS_OPTIONS)
+ set(_TARGET_CPU_FEATURES "" PARENT_SCOPE)
+ set(_TARGET_CPU_FEATURES_SUFFIX "" PARENT_SCOPE)
+ set(_TARGET_PASS_OPTIONS "" PARENT_SCOPE)
if ("${_INPUT_TARGET_CPU_FEATURES}" STREQUAL "default")
set(_ENABLED "TRUE" PARENT_SCOPE)
- set(_TARGET_CPU_FEATURES "" PARENT_SCOPE)
- set(_TARGET_CPU_FEATURES_SUFFIX "" PARENT_SCOPE)
return()
endif()
string(REGEX MATCHALL "[^:]+" _COMPONENTS "${_INPUT_TARGET_CPU_FEATURES}")
@@ -349,8 +356,11 @@
TARGET_CPU_FEATURES should match [a-zA-Z0-9]+ after the initial +. \
Got: ${_TARGET_CPU_FEATURES}.")
endif()
+ # Generate the target cpu features suffix string with underscores ('_')
+ # separating the features.
string(REPLACE "+" "_" _TARGET_CPU_FEATURES_SUFFIX_LOCAL "${_TARGET_CPU_FEATURES}")
set(_TARGET_CPU_FEATURES_SUFFIX "${_TARGET_CPU_FEATURES_SUFFIX_LOCAL}" PARENT_SCOPE)
+ set(_TARGET_PASS_OPTIONS "arch=${_FILTER_ARCH} features=${_TARGET_CPU_FEATURES}" PARENT_SCOPE)
else()
set(_ENABLED "FALSE" PARENT_SCOPE)
endif()
@@ -425,7 +435,8 @@
set(_TARGET_CPU_FEATURES_VARIANTS "default")
endif()
foreach(_TARGET_CPU_FEATURES_LIST_ELEM IN LISTS _TARGET_CPU_FEATURES_VARIANTS)
- process_target_cpu_features("${_TARGET_CPU_FEATURES_LIST_ELEM}" _ENABLED _TARGET_CPU_FEATURES _TARGET_CPU_FEATURES_SUFFIX)
+ process_target_cpu_features("${_TARGET_CPU_FEATURES_LIST_ELEM}" _ENABLED _TARGET_CPU_FEATURES _TARGET_CPU_FEATURES_SUFFIX _TARGET_PASS_OPTIONS)
+ string(REPLACE "#pass_options_variant#" "${_TARGET_PASS_OPTIONS}" _PROCESSED_OPT_FLAGS "${_RULE_OPT_FLAGS}")
if (NOT _ENABLED)
# The current entry is disabled on the target CPU architecture.
continue()
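A hypothetical Python sketch of the parsing contract documented above, to make the four outputs concrete (it mirrors, rather than replaces, the CMake implementation):

```python
def process_target_cpu_features(entry: str, target_arch: str):
    """Parses 'default' or 'arch:features' per the contract above."""
    if entry == "default":
        # Always enabled; all other outputs are empty strings.
        return True, "", "", ""
    arch, features = entry.split(":", 1)
    enabled = arch == target_arch  # CMake matches CMAKE_SYSTEM_PROCESSOR.
    # "+dotprod" -> "_dotprod": a suffix safe for target and test names.
    suffix = features.replace("+", "_")
    pass_options = f"arch={arch} features={features}"
    return enabled, features, suffix, pass_options

# process_target_cpu_features("aarch64:+dotprod", "aarch64")
# -> (True, "+dotprod", "_dotprod", "arch=aarch64 features=+dotprod")
```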
diff --git a/build_tools/cmake/iree_python.cmake b/build_tools/cmake/iree_python.cmake
index 094056a..48fd3d3 100644
--- a/build_tools/cmake/iree_python.cmake
+++ b/build_tools/cmake/iree_python.cmake
@@ -142,11 +142,14 @@
# MODULE_NAME: Base-name of the module.
# SRCS: List of source files for the library
# DEPS: List of other targets the test python libraries require
+# COPTS: List of private compile options
+# DEFINES: List of public defines
+# INCLUDES: Include directories to add to dependencies
function(iree_pyext_module)
cmake_parse_arguments(ARG
""
"NAME;MODULE_NAME;UNIX_LINKER_SCRIPT"
- "SRCS;DEPS;COPTS;INCLUDES"
+ "SRCS;DEPS;COPTS;DEFINES;INCLUDES"
${ARGN})
iree_package_ns(_PACKAGE_NS)
@@ -216,6 +219,11 @@
${_RTTI_AND_EXCEPTION_COPTS}
)
+ target_compile_definitions(
+ ${_NAME} PUBLIC
+ ${ARG_DEFINES}
+ )
+
# Link flags.
if(UNIX AND NOT APPLE) # Apple does not support linker scripts.
if(ARG_UNIX_LINKER_SCRIPT)
diff --git a/build_tools/cmake/iree_trace_runner_test.cmake b/build_tools/cmake/iree_trace_runner_test.cmake
index b32fcdf..30f3e2f 100644
--- a/build_tools/cmake/iree_trace_runner_test.cmake
+++ b/build_tools/cmake/iree_trace_runner_test.cmake
@@ -342,7 +342,8 @@
set(_TARGET_CPU_FEATURES_VARIANTS "default")
endif()
foreach(_TARGET_CPU_FEATURES_LIST_ELEM IN LISTS _TARGET_CPU_FEATURES_VARIANTS)
- process_target_cpu_features("${_TARGET_CPU_FEATURES_LIST_ELEM}" _ENABLED _TARGET_CPU_FEATURES _TARGET_CPU_FEATURES_SUFFIX)
+ process_target_cpu_features("${_TARGET_CPU_FEATURES_LIST_ELEM}" _ENABLED _TARGET_CPU_FEATURES _TARGET_CPU_FEATURES_SUFFIX _TARGET_PASS_OPTIONS)
+ string(REPLACE "#pass_options_variant#" "${_TARGET_PASS_OPTIONS}" _PROCESSED_OPT_FLAGS "${_RULE_OPT_FLAGS}")
if (NOT _ENABLED)
# The current entry is disabled on the target CPU architecture.
continue()
@@ -369,7 +370,7 @@
OPT_TOOL
${_RULE_OPT_TOOL}
OPT_FLAGS
- ${_RULE_OPT_FLAGS}
+ ${_PROCESSED_OPT_FLAGS}
TARGET_CPU_FEATURES
${_TARGET_CPU_FEATURES}
)
diff --git a/build_tools/cmake/rebuild.sh b/build_tools/cmake/rebuild.sh
index 68e9420..4151aa5 100755
--- a/build_tools/cmake/rebuild.sh
+++ b/build_tools/cmake/rebuild.sh
@@ -50,5 +50,5 @@
"-DIREE_ENABLE_ASSERTIONS=ON"
)
-"$CMAKE_BIN" "${CMAKE_ARGS[@]?}" ..
+"$CMAKE_BIN" "${CMAKE_ARGS[@]?}" "$@" ..
"$CMAKE_BIN" --build .
diff --git a/build_tools/docker/android/Dockerfile b/build_tools/docker/android/Dockerfile
index bb425ec..e23e6ad 100644
--- a/build_tools/docker/android/Dockerfile
+++ b/build_tools/docker/android/Dockerfile
@@ -6,7 +6,7 @@
# An image for cross-compiling IREE towards Android.
-FROM gcr.io/iree-oss/base@sha256:ec10501f22c04c2199c68df5f6782946224fb63568e2a1701ddb3c4928a42bb5
+FROM gcr.io/iree-oss/base@sha256:ea076afa9ec854ed75d0608a08d64e4bc5e1ad2933d497912b4034553d4a56ac
ARG NDK_VERSION=r21d
WORKDIR /install-ndk
diff --git a/build_tools/docker/base/Dockerfile b/build_tools/docker/base/Dockerfile
index 7c7040e..ac12ef3 100644
--- a/build_tools/docker/base/Dockerfile
+++ b/build_tools/docker/base/Dockerfile
@@ -50,8 +50,8 @@
# to get the /usr/share path.
# See https://github.com/moby/moby/issues/41383
ARG CMAKE_MAJOR_VERSION=3
-ARG CMAKE_MINOR_VERSION=16
-ARG CMAKE_PATCH_VERSION=3
+ARG CMAKE_MINOR_VERSION=17
+ARG CMAKE_PATCH_VERSION=5
ENV CMAKE_VERSION="${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}.${CMAKE_PATCH_VERSION}"
diff --git a/build_tools/docker/emscripten/Dockerfile b/build_tools/docker/emscripten/Dockerfile
index f8c8538..86dc6da 100644
--- a/build_tools/docker/emscripten/Dockerfile
+++ b/build_tools/docker/emscripten/Dockerfile
@@ -6,7 +6,7 @@
# An image for building IREE through Emscripten.
-FROM gcr.io/iree-oss/base@sha256:ec10501f22c04c2199c68df5f6782946224fb63568e2a1701ddb3c4928a42bb5
+FROM gcr.io/iree-oss/base@sha256:ea076afa9ec854ed75d0608a08d64e4bc5e1ad2933d497912b4034553d4a56ac
# See also
# * https://github.com/emscripten-core/emsdk/blob/main/docker/Dockerfile
diff --git a/build_tools/docker/frontends-nvidia/Dockerfile b/build_tools/docker/frontends-nvidia/Dockerfile
index 5197437..d589810 100644
--- a/build_tools/docker/frontends-nvidia/Dockerfile
+++ b/build_tools/docker/frontends-nvidia/Dockerfile
@@ -8,7 +8,7 @@
# The NVidia drivers need to *exactly* match between the host machine and the
# docker image.
-FROM gcr.io/iree-oss/frontends@sha256:e7faf90e0f77ad6f9236df88e7af26447644c0860610c2cb7a3202c2b961795f
+FROM gcr.io/iree-oss/frontends@sha256:c720dc8788a49c5a3cbd6b947cd37360ac9160dac043170dd513c8eafb3c5818
# We use .deb files that we host because we have to pin the version exactly to
# match the host machine and packages routinely disappear from the Ubuntu
diff --git a/build_tools/docker/frontends-swiftshader/Dockerfile b/build_tools/docker/frontends-swiftshader/Dockerfile
index dc50792..ad8c30d 100644
--- a/build_tools/docker/frontends-swiftshader/Dockerfile
+++ b/build_tools/docker/frontends-swiftshader/Dockerfile
@@ -4,8 +4,8 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-FROM gcr.io/iree-oss/frontends@sha256:e7faf90e0f77ad6f9236df88e7af26447644c0860610c2cb7a3202c2b961795f
-COPY --from=gcr.io/iree-oss/swiftshader@sha256:2fc835bd93f2fb5ec763b882bb697e5293c24b071f9e4c3e6c0ee5d290c3d347 \
+FROM gcr.io/iree-oss/frontends@sha256:c720dc8788a49c5a3cbd6b947cd37360ac9160dac043170dd513c8eafb3c5818
+COPY --from=gcr.io/iree-oss/swiftshader@sha256:9a83eb64f53c354772fbe53d4de6669eea7acb39d373b134f66b17fbdce22936 \
/swiftshader /swiftshader
# Set VK_ICD_FILENAMES so Vulkan loader can find the SwiftShader ICD.
diff --git a/build_tools/docker/frontends/Dockerfile b/build_tools/docker/frontends/Dockerfile
index 826c215..7960407 100644
--- a/build_tools/docker/frontends/Dockerfile
+++ b/build_tools/docker/frontends/Dockerfile
@@ -4,7 +4,7 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-FROM gcr.io/iree-oss/android@sha256:22368d0c424160c68109ff33772b84b4633304d315709de959353db8ffd81c52
+FROM gcr.io/iree-oss/android@sha256:a98f4d2299257c311ecf182e3ecbdf24f228ce55d22ed9f6b229a92c96a16c86
WORKDIR /install-kws
@@ -22,9 +22,8 @@
WORKDIR /
RUN python3 -m pip install --upgrade \
- # Matching versions of TF and Keras
- tf-nightly==2.7.0.dev20210806 \
- keras-nightly==2.7.0.dev2021080600 \
+ tensorflow==2.7.1 \
+ keras==2.7.0 \
# JAX.
jax \
jaxlib \
diff --git a/build_tools/docker/gradle-android/Dockerfile b/build_tools/docker/gradle-android/Dockerfile
index 625c5bc..39c56a0 100644
--- a/build_tools/docker/gradle-android/Dockerfile
+++ b/build_tools/docker/gradle-android/Dockerfile
@@ -7,7 +7,7 @@
# An image for cross-compiling IREE's TFLite Java Bindings with Gradle and
# CMake.
-FROM gcr.io/iree-oss/base@sha256:ec10501f22c04c2199c68df5f6782946224fb63568e2a1701ddb3c4928a42bb5
+FROM gcr.io/iree-oss/base@sha256:ea076afa9ec854ed75d0608a08d64e4bc5e1ad2933d497912b4034553d4a56ac
### Java ###
WORKDIR /install-jdk
diff --git a/build_tools/docker/nvidia/Dockerfile b/build_tools/docker/nvidia/Dockerfile
index 551d113..41554bc 100644
--- a/build_tools/docker/nvidia/Dockerfile
+++ b/build_tools/docker/nvidia/Dockerfile
@@ -16,7 +16,7 @@
ARG NVIDIA_COMMON_DEB="libnvidia-common-460_460.39-0ubuntu0.18.04.1_all.deb"
-FROM gcr.io/iree-oss/base@sha256:ec10501f22c04c2199c68df5f6782946224fb63568e2a1701ddb3c4928a42bb5 AS fetch-nvidia
+FROM gcr.io/iree-oss/base@sha256:ea076afa9ec854ed75d0608a08d64e4bc5e1ad2933d497912b4034553d4a56ac AS fetch-nvidia
ARG NVIDIA_COMMON_DEB
ARG NVIDIA_GL_DEB
ARG NVIDIA_COMPUTE_DEB
@@ -36,7 +36,7 @@
# does not support Ubuntu 18.04.
# This allows to share configuration with base CMake, but it also means we need
# to MATCH the driver version between the host machine and the docker image.
-FROM gcr.io/iree-oss/base@sha256:ec10501f22c04c2199c68df5f6782946224fb63568e2a1701ddb3c4928a42bb5 AS final
+FROM gcr.io/iree-oss/base@sha256:ea076afa9ec854ed75d0608a08d64e4bc5e1ad2933d497912b4034553d4a56ac AS final
ARG NVIDIA_COMMON_DEB
ARG NVIDIA_GL_DEB
ARG NVIDIA_COMPUTE_DEB
@@ -50,3 +50,10 @@
RUN apt-get install "/tmp/${NVIDIA_COMMON_DEB?}" \
"/tmp/${NVIDIA_GL_DEB?}" \
"/tmp/${NVIDIA_COMPUTE_DEB?}"
+
+# Install the CUDA SDK.
+RUN wget https://developer.download.nvidia.com/compute/cuda/11.6.0/local_installers/cuda-repo-debian11-11-6-local_11.6.0-510.39.01-1_amd64.deb \
+ && dpkg --install cuda-repo-debian11-11-6-local_11.6.0-510.39.01-1_amd64.deb \
+ && apt-key add /var/cuda-repo-debian11-11-6-local/7fa2af80.pub \
+ && apt-get update \
+ && apt-get -y install cuda-toolkit-11-6
diff --git a/build_tools/docker/prod_digests.txt b/build_tools/docker/prod_digests.txt
index f658b81..a94ebd7 100644
--- a/build_tools/docker/prod_digests.txt
+++ b/build_tools/docker/prod_digests.txt
@@ -1,11 +1,11 @@
-gcr.io/iree-oss/base@sha256:ec10501f22c04c2199c68df5f6782946224fb63568e2a1701ddb3c4928a42bb5
-gcr.io/iree-oss/swiftshader@sha256:2fc835bd93f2fb5ec763b882bb697e5293c24b071f9e4c3e6c0ee5d290c3d347
-gcr.io/iree-oss/samples@sha256:6cb69f51c69e6cf8ad57db04bbf80227bd3efd9b014111a702e81b716309f486
-gcr.io/iree-oss/frontends@sha256:e7faf90e0f77ad6f9236df88e7af26447644c0860610c2cb7a3202c2b961795f
-gcr.io/iree-oss/frontends-nvidia@sha256:651e1c1c8dbe1cdfcca9648436e0c6fc9869d5df085b1b129438bf016288bef2
-gcr.io/iree-oss/frontends-swiftshader@sha256:e70c6524cea980b9c6077a76e0ac90464c47eff4f27b6c74b8eaab3ff1fb35fc
-gcr.io/iree-oss/gradle-android@sha256:d36f3d172c3304e557cd3d9a13bae4590cb2a8d19d229b10f126423e7314a413
-gcr.io/iree-oss/riscv@sha256:24a401f9d47a16a0b460bf34a8485f087b7d88944ed661dd1201ce2d6c19df3e
-gcr.io/iree-oss/nvidia@sha256:b27c2feb5fc0e7125c5dd933c65c0f7029c157343c91269a17e892e76dd433a3
-gcr.io/iree-oss/emscripten@sha256:60daf8bdb36e7e120687197d07f41aebd4965805dbb9f2978622cc36f0a90840
-gcr.io/iree-oss/android@sha256:22368d0c424160c68109ff33772b84b4633304d315709de959353db8ffd81c52
+gcr.io/iree-oss/base@sha256:ea076afa9ec854ed75d0608a08d64e4bc5e1ad2933d497912b4034553d4a56ac
+gcr.io/iree-oss/swiftshader@sha256:9a83eb64f53c354772fbe53d4de6669eea7acb39d373b134f66b17fbdce22936
+gcr.io/iree-oss/samples@sha256:c57ab23ad4b1e1241be1228030a7d4a6b729b9ed5ee1c126d1e9098c1dc84902
+gcr.io/iree-oss/frontends@sha256:c720dc8788a49c5a3cbd6b947cd37360ac9160dac043170dd513c8eafb3c5818
+gcr.io/iree-oss/frontends-nvidia@sha256:05c1694aaa71bfc778472582c14fa95a4490398224a39d0d6837ed3012d2c4c8
+gcr.io/iree-oss/frontends-swiftshader@sha256:4327fef0ed53c8658f052d145f2aa8ce395c29f48fb075df78638fd3ce3603e5
+gcr.io/iree-oss/gradle-android@sha256:bbe2b4576f2a6d8df9d84c42e0fbd3c644edf9512ab3d8fe14ffd887f7ff526c
+gcr.io/iree-oss/riscv@sha256:a4da7dbebd4fac1cc2f674eb2a49d718a21d3604e39976b673814864afc44e3d
+gcr.io/iree-oss/nvidia@sha256:c1053062515d6fe9becc38305a70dc3fba486068513e2102844b407488fb5629
+gcr.io/iree-oss/emscripten@sha256:2d99a86203c4de572a8928892cbfcd41d8e16ca432c53ad01ed33d2fb991ad78
+gcr.io/iree-oss/android@sha256:a98f4d2299257c311ecf182e3ecbdf24f228ce55d22ed9f6b229a92c96a16c86
diff --git a/build_tools/docker/riscv/Dockerfile b/build_tools/docker/riscv/Dockerfile
index 68f1053..9c3914a 100644
--- a/build_tools/docker/riscv/Dockerfile
+++ b/build_tools/docker/riscv/Dockerfile
@@ -6,7 +6,7 @@
# An image for cross-compiling IREE towards RISCV using CMake.
-FROM gcr.io/iree-oss/base@sha256:ec10501f22c04c2199c68df5f6782946224fb63568e2a1701ddb3c4928a42bb5 AS install-riscv
+FROM gcr.io/iree-oss/base@sha256:ea076afa9ec854ed75d0608a08d64e4bc5e1ad2933d497912b4034553d4a56ac AS install-riscv
WORKDIR /install-riscv
RUN wget "https://storage.googleapis.com/iree-shared-files/toolchain_iree_rvv-intrinsic.tar.gz"
RUN tar -xf "toolchain_iree_rvv-intrinsic.tar.gz" -C /usr/src/
@@ -15,7 +15,7 @@
RUN wget "https://storage.googleapis.com/iree-shared-files/qemu-riscv.tar.gz"
RUN tar -xf "qemu-riscv.tar.gz" -C /usr/src/
-FROM gcr.io/iree-oss/base@sha256:ec10501f22c04c2199c68df5f6782946224fb63568e2a1701ddb3c4928a42bb5 AS final
+FROM gcr.io/iree-oss/base@sha256:ea076afa9ec854ed75d0608a08d64e4bc5e1ad2933d497912b4034553d4a56ac AS final
COPY --from=install-riscv "/usr/src/toolchain_iree" "/usr/src/toolchain_iree"
COPY --from=install-riscv "/usr/src/toolchain_iree_rv32imf" "/usr/src/toolchain_iree_rv32imf"
COPY --from=install-riscv "/usr/src/qemu-riscv" "/usr/src/qemu-riscv"
diff --git a/build_tools/docker/samples/Dockerfile b/build_tools/docker/samples/Dockerfile
index 7558051..5d3960a 100644
--- a/build_tools/docker/samples/Dockerfile
+++ b/build_tools/docker/samples/Dockerfile
@@ -9,7 +9,7 @@
# * Vulkan (using SwiftShader)
# * Python (including `venv` and common pip packages needed for Colab)
-FROM gcr.io/iree-oss/swiftshader@sha256:2fc835bd93f2fb5ec763b882bb697e5293c24b071f9e4c3e6c0ee5d290c3d347
+FROM gcr.io/iree-oss/swiftshader@sha256:9a83eb64f53c354772fbe53d4de6669eea7acb39d373b134f66b17fbdce22936
# Update setuptools per https://github.com/pypa/setuptools/issues/1694#issuecomment-466010982
RUN apt-get update \
diff --git a/build_tools/docker/swiftshader/Dockerfile b/build_tools/docker/swiftshader/Dockerfile
index fdfc4ce..561fcb6 100644
--- a/build_tools/docker/swiftshader/Dockerfile
+++ b/build_tools/docker/swiftshader/Dockerfile
@@ -4,7 +4,7 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-FROM gcr.io/iree-oss/base@sha256:ec10501f22c04c2199c68df5f6782946224fb63568e2a1701ddb3c4928a42bb5 AS install-swiftshader
+FROM gcr.io/iree-oss/base@sha256:ea076afa9ec854ed75d0608a08d64e4bc5e1ad2933d497912b4034553d4a56ac AS install-swiftshader
WORKDIR /install-swiftshader
RUN apt-get update && apt-get install -y git
@@ -33,7 +33,7 @@
RUN echo "${SWIFTSHADER_COMMIT?}" > /swiftshader/git-commit
# Ubuntu 18.04
-FROM gcr.io/iree-oss/base@sha256:ec10501f22c04c2199c68df5f6782946224fb63568e2a1701ddb3c4928a42bb5 AS final
+FROM gcr.io/iree-oss/base@sha256:ea076afa9ec854ed75d0608a08d64e4bc5e1ad2933d497912b4034553d4a56ac AS final
COPY --from=install-swiftshader /swiftshader /swiftshader
# Set VK_ICD_FILENAMES so Vulkan loader can find the SwiftShader ICD.
diff --git a/build_tools/github_actions/build_dist.py b/build_tools/github_actions/build_dist.py
index fa3b6ea..965779c 100644
--- a/build_tools/github_actions/build_dist.py
+++ b/build_tools/github_actions/build_dist.py
@@ -124,6 +124,16 @@
# Clean up install and build trees.
shutil.rmtree(INSTALL_DIR, ignore_errors=True)
remove_cmake_cache()
+ extra_cmake_flags = []
+
+  # Enable CUDA on platforms where we expect to have the deps and produce
+ # such binaries.
+ if platform.system() == "Linux":
+ print("*** Enabling CUDA compiler target and runtime ***")
+ extra_cmake_flags.extend([
+ "-DIREE_TARGET_BACKEND_CUDA=ON",
+ "-DIREE_HAL_DRIVER_CUDA=ON",
+ ])
# CMake configure.
print("*** Configuring ***")
@@ -131,12 +141,13 @@
sys.executable,
CMAKE_CI_SCRIPT,
f"-B{BUILD_DIR}",
+ "--log-level=VERBOSE",
f"-DCMAKE_INSTALL_PREFIX={INSTALL_DIR}",
f"-DCMAKE_BUILD_TYPE=Release",
f"-DIREE_BUILD_COMPILER=ON",
f"-DIREE_BUILD_PYTHON_BINDINGS=OFF",
f"-DIREE_BUILD_SAMPLES=OFF",
- ],
+ ] + extra_cmake_flags,
check=True)
print("*** Building ***")
@@ -192,12 +203,21 @@
f"-DIREE_BUILD_TRACY=ON",
])
+  # Enable CUDA on platforms where we expect to have the deps and produce
+ # such binaries.
+ if platform.system() == "Linux":
+ print("*** Enabling CUDA runtime ***")
+ extra_cmake_flags.extend([
+ "-DIREE_HAL_DRIVER_CUDA=ON",
+ ])
+
# CMake configure.
print("*** Configuring ***")
subprocess.run([
sys.executable,
CMAKE_CI_SCRIPT,
f"-B{BUILD_DIR}",
+ "--log-level=VERBOSE",
f"-DCMAKE_INSTALL_PREFIX={INSTALL_DIR}",
f"-DCMAKE_BUILD_TYPE=Release",
f"-DIREE_BUILD_COMPILER=OFF",
diff --git a/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/core/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/core/build_kokoro.sh
index 9b184d8..0f401ea 100755
--- a/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/core/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/core/build_kokoro.sh
@@ -28,7 +28,7 @@
# and the cache key is the docker container it's run in (to ensure correct cache
# hits).
docker run "${DOCKER_RUN_ARGS[@]?}" \
- gcr.io/iree-oss/frontends-swiftshader@sha256:e70c6524cea980b9c6077a76e0ac90464c47eff4f27b6c74b8eaab3ff1fb35fc \
+ gcr.io/iree-oss/frontends-swiftshader@sha256:4327fef0ed53c8658f052d145f2aa8ce395c29f48fb075df78638fd3ce3603e5 \
build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/core/build.sh
# Kokoro will rsync this entire directory back to the executor orchestrating the
diff --git a/build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-swiftshader/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-swiftshader/build_kokoro.sh
index c1f2a78..b050c07 100755
--- a/build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-swiftshader/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-swiftshader/build_kokoro.sh
@@ -24,7 +24,7 @@
docker_setup
docker run "${DOCKER_RUN_ARGS[@]?}" \
- gcr.io/iree-oss/frontends-swiftshader@sha256:e70c6524cea980b9c6077a76e0ac90464c47eff4f27b6c74b8eaab3ff1fb35fc \
+ gcr.io/iree-oss/frontends-swiftshader@sha256:4327fef0ed53c8658f052d145f2aa8ce395c29f48fb075df78638fd3ce3603e5 \
build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-swiftshader/build.sh
# Kokoro will rsync this entire directory back to the executor orchestrating the
diff --git a/build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-turing/build.sh b/build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-turing/build.sh
index a1cff76..0baddcd 100755
--- a/build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-turing/build.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-turing/build.sh
@@ -57,6 +57,8 @@
# TODO(gcmn): It would be nice to be able to build and test as much as possible,
# so a build failure only prevents building/testing things that depend on it and
# we can still run the other tests.
+# TODO: Add "-DIREE_TARGET_BACKEND_CUDA=ON -DIREE_HAL_DRIVER_CUDA=ON" once the
+# VMs have been updated with the correct CUDA SDK.
echo "Configuring CMake"
"${CMAKE_BIN}" -B "${CMAKE_BUILD_DIR?}" -G Ninja \
-DIREE_TF_TOOLS_ROOT="${BAZEL_BINDIR?}/iree_tf_compiler/" \
diff --git a/build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-turing/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-turing/build_kokoro.sh
index 9789e69..7edc8b2 100755
--- a/build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-turing/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-turing/build_kokoro.sh
@@ -30,7 +30,7 @@
docker run "${DOCKER_RUN_ARGS[@]?}" \
--gpus all \
- gcr.io/iree-oss/frontends-nvidia@sha256:651e1c1c8dbe1cdfcca9648436e0c6fc9869d5df085b1b129438bf016288bef2 \
+ gcr.io/iree-oss/frontends-nvidia@sha256:05c1694aaa71bfc778472582c14fa95a4490398224a39d0d6837ed3012d2c4c8 \
build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-turing/build.sh
# Kokoro will rsync this entire directory back to the executor orchestrating the
diff --git a/build_tools/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/build_kokoro.sh
index 0d2bbf1..9c27ac6 100755
--- a/build_tools/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/build_kokoro.sh
@@ -26,7 +26,7 @@
# Need to use frontends image (which also has Android toolchain) to build the
# TFLite compiler for generating benchmarks.
docker run "${DOCKER_RUN_ARGS[@]?}" \
- gcr.io/iree-oss/frontends@sha256:e7faf90e0f77ad6f9236df88e7af26447644c0860610c2cb7a3202c2b961795f \
+ gcr.io/iree-oss/frontends@sha256:c720dc8788a49c5a3cbd6b947cd37360ac9160dac043170dd513c8eafb3c5818 \
build_tools/kokoro/gcp_ubuntu/cmake/android/build.sh arm64-v8a
# Kokoro will rsync this entire directory back to the executor orchestrating the
diff --git a/build_tools/kokoro/gcp_ubuntu/cmake/baremetal/riscv32/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/cmake/baremetal/riscv32/build_kokoro.sh
index 41e9b2a..5e80ed7 100755
--- a/build_tools/kokoro/gcp_ubuntu/cmake/baremetal/riscv32/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/baremetal/riscv32/build_kokoro.sh
@@ -24,7 +24,7 @@
docker_setup
docker run "${DOCKER_RUN_ARGS[@]?}" \
- gcr.io/iree-oss/riscv@sha256:24a401f9d47a16a0b460bf34a8485f087b7d88944ed661dd1201ce2d6c19df3e \
+ gcr.io/iree-oss/riscv@sha256:a4da7dbebd4fac1cc2f674eb2a49d718a21d3604e39976b673814864afc44e3d \
build_tools/kokoro/gcp_ubuntu/cmake/baremetal/riscv32/build.sh
# Kokoro will rsync this entire directory back to the executor orchestrating the
diff --git a/build_tools/kokoro/gcp_ubuntu/cmake/linux/riscv64/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/cmake/linux/riscv64/build_kokoro.sh
index 27aa323..eec1d3f 100755
--- a/build_tools/kokoro/gcp_ubuntu/cmake/linux/riscv64/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/linux/riscv64/build_kokoro.sh
@@ -24,7 +24,7 @@
docker_setup
docker run "${DOCKER_RUN_ARGS[@]?}" \
- gcr.io/iree-oss/riscv@sha256:24a401f9d47a16a0b460bf34a8485f087b7d88944ed661dd1201ce2d6c19df3e \
+ gcr.io/iree-oss/riscv@sha256:a4da7dbebd4fac1cc2f674eb2a49d718a21d3604e39976b673814864afc44e3d \
build_tools/kokoro/gcp_ubuntu/cmake/linux/riscv64/build.sh
# Kokoro will rsync this entire directory back to the executor orchestrating the
diff --git a/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader-asan/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader-asan/build_kokoro.sh
index 3b558af..ee3f7e3 100755
--- a/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader-asan/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader-asan/build_kokoro.sh
@@ -24,7 +24,7 @@
docker_setup
docker run "${DOCKER_RUN_ARGS[@]?}" \
- gcr.io/iree-oss/swiftshader@sha256:2fc835bd93f2fb5ec763b882bb697e5293c24b071f9e4c3e6c0ee5d290c3d347 \
+ gcr.io/iree-oss/swiftshader@sha256:9a83eb64f53c354772fbe53d4de6669eea7acb39d373b134f66b17fbdce22936 \
build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader-asan/build.sh
# Kokoro will rsync this entire directory back to the executor orchestrating the
diff --git a/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build_kokoro.sh
index 1fd156d..90b26e1 100755
--- a/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build_kokoro.sh
@@ -24,7 +24,7 @@
docker_setup
docker run "${DOCKER_RUN_ARGS[@]?}" \
- gcr.io/iree-oss/swiftshader@sha256:2fc835bd93f2fb5ec763b882bb697e5293c24b071f9e4c3e6c0ee5d290c3d347 \
+ gcr.io/iree-oss/swiftshader@sha256:9a83eb64f53c354772fbe53d4de6669eea7acb39d373b134f66b17fbdce22936 \
build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build.sh
# Kokoro will rsync this entire directory back to the executor orchestrating the
diff --git a/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build.sh b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build.sh
index 8363787..f6c6d79 100755
--- a/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build.sh
@@ -34,7 +34,39 @@
# so a build failure only prevents building/testing things that depend on it and
# we can still run the other tests.
echo "Building with cmake"
-./build_tools/cmake/clean_build.sh
+
+ROOT_DIR=$(git rev-parse --show-toplevel)
+
+cd ${ROOT_DIR?}
+rm -rf build/
+
+CMAKE_BIN=${CMAKE_BIN:-$(which cmake)}
+
+"$CMAKE_BIN" --version
+ninja --version
+
+mkdir build
+cd build
+
+CMAKE_ARGS=(
+ "-G" "Ninja"
+ # Let's make linking fast
+ "-DIREE_ENABLE_LLD=ON"
+ "-DCMAKE_BUILD_TYPE=RelWithDebInfo"
+
+ "-DIREE_BUILD_PYTHON_BINDINGS=ON"
+
+ "-DIREE_ENABLE_ASSERTIONS=ON"
+
+ # Enable CUDA backend to test on Turing hardware.
+ "-DIREE_TARGET_BACKEND_CUDA=ON"
+ "-DIREE_HAL_DRIVER_CUDA=ON"
+)
+
+"$CMAKE_BIN" "${CMAKE_ARGS[@]?}" "$@" ..
+"$CMAKE_BIN" --build .
+
+cd ${ROOT_DIR?}
export IREE_VULKAN_F16_DISABLE=0
export IREE_CUDA_DISABLE=0
diff --git a/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build_kokoro.sh
index b6b7ff1..8127131 100755
--- a/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build_kokoro.sh
@@ -30,7 +30,7 @@
docker run "${DOCKER_RUN_ARGS[@]?}" \
--gpus all \
- gcr.io/iree-oss/nvidia@sha256:b27c2feb5fc0e7125c5dd933c65c0f7029c157343c91269a17e892e76dd433a3 \
+ gcr.io/iree-oss/nvidia@sha256:c1053062515d6fe9becc38305a70dc3fba486068513e2102844b407488fb5629 \
build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build.sh
# Kokoro will rsync this entire directory back to the executor orchestrating the
diff --git a/docs/developers/developing_iree/llvm_version_bump.md b/docs/developers/developing_iree/llvm_version_bump.md
index 970bc31..b30b3d1 100644
--- a/docs/developers/developing_iree/llvm_version_bump.md
+++ b/docs/developers/developing_iree/llvm_version_bump.md
@@ -110,7 +110,7 @@
```
cd ~/src
-git clone git clone https://github.com/tensorflow/tensorflow.git
+git clone https://github.com/tensorflow/tensorflow.git
git clone https://github.com/tensorflow/mlir-hlo.git
```
diff --git a/docs/developers/developing_iree/profiling_with_tracy.md b/docs/developers/developing_iree/profiling_with_tracy.md
index 5b3e27e..cff3897 100644
--- a/docs/developers/developing_iree/profiling_with_tracy.md
+++ b/docs/developers/developing_iree/profiling_with_tracy.md
@@ -33,6 +33,7 @@
```shell
sudo apt install libcapstone-dev libtbb-dev libzstd-dev
```
+The zstd version packaged on Ubuntu 18.04 is too old; you will need to build a newer version from source: https://github.com/facebook/zstd.git
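+
+A minimal sketch of that (assuming zstd's standard `make` build, installing to
+the default prefix):
+
+```shell
+git clone https://github.com/facebook/zstd.git
+cd zstd
+make
+sudo make install
+```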
### Mac
```shell
diff --git a/docs/website/docs/blog/2021-07-19-tflite-tosa.md b/docs/website/docs/blog/2021-07-19-tflite-tosa.md
index 90929b6..9da0139 100644
--- a/docs/website/docs/blog/2021-07-19-tflite-tosa.md
+++ b/docs/website/docs/blog/2021-07-19-tflite-tosa.md
@@ -34,11 +34,11 @@
## Examples
TFLite with IREE is available in Python and Java. We have a
-[colab notebook](https://colab.sandbox.google.com/github/google/iree/blob/main/colab/tflite_text_classification.ipynb)
+[colab notebook](https://colab.research.google.com/github/google/iree/blob/main/colab/tflite_text_classification.ipynb)
that shows how to use IREE’s python bindings and TFLite compiler tools to
compile a pre-trained TFLite model from a flatbuffer and run using IREE. We
also have an
[Android Java app](https://github.com/not-jenni/iree-android-tflite-demo) that
was forked from an existing TFLite demo app, swapping out the TFLite library
for our own AAR. More information on IREE’s TFLite frontend is available
-[here](../ml-frameworks/tensorflow-lite.md).
+[here](../getting-started/tflite.md).
diff --git a/docs/website/docs/deployment-configurations/cpu-dylib.md b/docs/website/docs/deployment-configurations/cpu-dylib.md
index e6bc143..615de7c 100644
--- a/docs/website/docs/deployment-configurations/cpu-dylib.md
+++ b/docs/website/docs/deployment-configurations/cpu-dylib.md
@@ -135,5 +135,5 @@
[pypi]: https://pypi.org/user/google-iree-pypi-deploy/
[python-bindings]: ../bindings/python.md
[tf-hub-mobilenetv2]: https://tfhub.dev/google/tf2-preview/mobilenet_v2/classification
-[tf-import]: ../ml-frameworks/tensorflow.md
-[tflite-import]: ../ml-frameworks/tensorflow-lite.md
+[tf-import]: ../getting-started/tensorflow.md
+[tflite-import]: ../getting-started/tensorflow-lite.md
diff --git a/docs/website/docs/deployment-configurations/gpu-cuda-rocm.md b/docs/website/docs/deployment-configurations/gpu-cuda-rocm.md
index b630b5e..9147799 100644
--- a/docs/website/docs/deployment-configurations/gpu-cuda-rocm.md
+++ b/docs/website/docs/deployment-configurations/gpu-cuda-rocm.md
@@ -1,6 +1,9 @@
-# CUDA and ROCM GPU HAL Driver
+# CUDA and ROCm GPU HAL Driver
-IREE can accelerate model execution on NVIDIA GPUs using CUDA and on AMD GPUs using ROCm. Due to the similarity of CUDA and ROCm APIs and infrastructure, the CUDA and ROCm backends share much of their implementation in IREE. The IREE compiler uses a similar GPU code generation pipeline for each, but generates PTX for CUDA and hsaco for ROCm. The IREE runtime HAL driver for ROCm mirrors the one for CUDA, except for the command graph - where CUDA has "direct", "stream", and "graph" command buffers, and ROCM has only "direct" command buffers.
+IREE can accelerate model execution on NVIDIA GPUs using CUDA and on AMD GPUs using ROCm. Due to the similarity of CUDA and ROCm APIs and infrastructure, the CUDA and ROCm backends share much of their implementation in IREE:
+
+* The IREE compiler uses a similar GPU code generation pipeline for each, but generates PTX for CUDA and hsaco for ROCm
+* The IREE runtime HAL driver for ROCm mirrors the one for CUDA, except for the command buffer implementations - where CUDA has "direct", "stream", and "graph" command buffers, and ROCm has only "direct" command buffers
## Prerequisites
@@ -25,34 +28,21 @@
rocm-smi | grep rocm
```
- If `rocm-smi` does not exist, you will need to [install the latest ROCM Toolkit SDK][rocm-toolkit].
+ If `rocm-smi` does not exist, you will need to [install the latest ROCm Toolkit SDK][rocm-toolkit].
## Get runtime and compiler
-### Get IREE runtime with CUDA HAL driver
+### Get IREE runtime
-Next you will need to get an IREE runtime that supports the CUDA HAL driver
-so it can execute the model on GPU via CUDA for Nvidia. Or the ROCM HAL driver to execute models on AMD hardware
+Next you will need to get an IREE runtime that includes the CUDA (for Nvidia
+hardware) or ROCm (for AMD hardware) HAL driver.
#### Build runtime from source
+
Please make sure you have followed the [Getting started][get-started] page
-to build IREE for Linux/Windows.
-
-=== "Nvidia/CUDA"
-
- The CUDA HAL driver is compiled in by default on non-Apple
- platforms.
-
- Ensure that the `IREE_HAL_DRIVER_CUDA` CMake option is `ON` when configuring
- for the target.
-
-=== "AMD/ROCm"
-
- Currently our support for ROCm/AMD hardware is still experimental. To enable it add:
- ```
- -DIREE_HAL_DRIVER_EXPERIMENTAL_ROCM=ON
- ```
- to the cmake build command.
+to build IREE from source, then enable the CUDA HAL driver with the
+`IREE_HAL_DRIVER_CUDA` option or the experimental ROCm HAL driver with the
+`IREE_HAL_DRIVER_EXPERIMENTAL_ROCM` option.
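+
+As a sketch, these are ordinary CMake options passed at configure time (the
+`../iree-build/` directory name here is illustrative):
+
+```shell
+# From the IREE source checkout.
+cmake -G Ninja -B ../iree-build/ \
+    -DIREE_HAL_DRIVER_CUDA=ON \
+    -DIREE_HAL_DRIVER_EXPERIMENTAL_ROCM=ON \
+    .
+```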
#### Download as Python package
@@ -78,24 +68,13 @@
#### Build compiler from source
Please make sure you have followed the [Getting started][get-started] page
-to build IREE for Linux/Windows and the [Android cross-compilation][android-cc]
-page for Android. The CUDA compiler backend and ROCM compiler backend is compiled in by default on all
-platforms.
-
-=== "Nvidia/CUDA"
-
- Ensure that the `IREE_TARGET_BACKEND_CUDA` CMake option is `ON` when
- configuring for the host.
-
-=== "AMD/ROCM"
-
- Ensure that the `IREE_TARGET_BACKEND_ROCM` CMake option is `ON` when
- configuring for the host.
+to build the IREE compiler, then enable the CUDA compiler target with the
+`IREE_TARGET_BACKEND_CUDA` option or the ROCm compiler target with the
+`IREE_TARGET_BACKEND_ROCM` option.
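+
+Likewise for the compiler targets (a sketch; both backends can be enabled in
+the same configure):
+
+```shell
+# From the IREE source checkout; build directory name is illustrative.
+cmake -G Ninja -B ../iree-build/ \
+    -DIREE_TARGET_BACKEND_CUDA=ON \
+    -DIREE_TARGET_BACKEND_ROCM=ON \
+    .
+```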
## Compile and run the model
-With the compiler and runtime for CUDA ready, we can now compile a model
-and run it on the GPU.
+With the compiler and runtime ready, we can now compile a model and run it on the GPU.
### Compile the model
@@ -135,7 +114,7 @@
Nvidia V100 | `sm_70`
Nvidia A100 | `sm_80`
-=== "AMD/ROCM"
+=== "AMD/ROCm"
``` shell hl_lines="3-6"
iree/tools/iree-translate \
@@ -149,7 +128,7 @@
Note that the ROCm bitcode directory (`iree-rocm-bc-dir`) path is required. If the system you are compiling IREE on has ROCm installed, then the default value of `/opt/rocm/amdgcn/bitcode` will usually suffice. If you intend to build the ROCm compiler on a system without ROCm, please set `iree-rocm-bc-dir` to the absolute path where you saved the amdgcn bitcode.
- Note that a rocm target chip(`iree-rocm-target-chip`) of the form `gfx<arch_number>` is needed
+ Note that a ROCm target chip (`iree-rocm-target-chip`) of the form `gfx<arch_number>` is needed
to compile towards each GPU architecture. If no architecture is specified then we will default to `gfx908`.
Here is a table of commonly used architectures:
@@ -176,7 +155,7 @@
--function_input="1x224x224x3xf32=0"
```
-=== "AMD/ROCM"
+=== "AMD/ROCm"
``` shell hl_lines="2"
iree/tools/iree-run-module \
@@ -196,7 +175,7 @@
[pypi]: https://pypi.org/user/google-iree-pypi-deploy/
[python-bindings]: ../bindings/python.md
[tf-hub-mobilenetv2]: https://tfhub.dev/google/tf2-preview/mobilenet_v2/classification
-[tf-import]: ../ml-frameworks/tensorflow.md
-[tflite-import]: ../ml-frameworks/tensorflow-lite.md
+[tf-import]: ../getting-started/tensorflow.md
+[tflite-import]: ../getting-started/tensorflow-lite.md
[cuda-toolkit]: https://developer.nvidia.com/cuda-downloads
[rocm-toolkit]: https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation_new.html
diff --git a/docs/website/docs/deployment-configurations/gpu-vulkan.md b/docs/website/docs/deployment-configurations/gpu-vulkan.md
index 07cfba5..041d0ea 100644
--- a/docs/website/docs/deployment-configurations/gpu-vulkan.md
+++ b/docs/website/docs/deployment-configurations/gpu-vulkan.md
@@ -203,7 +203,7 @@
[python-bindings]: ../bindings/python.md
[spirv]: https://www.khronos.org/registry/spir-v/
[tf-hub-mobilenetv2]: https://tfhub.dev/google/tf2-preview/mobilenet_v2/classification
-[tf-import]: ../ml-frameworks/tensorflow.md
-[tflite-import]: ../ml-frameworks/tensorflow-lite.md
+[tf-import]: ../getting-started/tensorflow.md
+[tflite-import]: ../getting-started/tensorflow-lite.md
[vulkan]: https://www.khronos.org/vulkan/
[vulkan-sdk]: https://vulkan.lunarg.com/sdk/home/
diff --git a/docs/website/docs/getting-started/index.md b/docs/website/docs/getting-started/index.md
new file mode 100644
index 0000000..2429f1e
--- /dev/null
+++ b/docs/website/docs/getting-started/index.md
@@ -0,0 +1,60 @@
+# Getting Started Guide
+
+## Setup
+
+Use the following command for the default installation, or check out the
+comprehensive installation [guide](../bindings/python.md) if your needs are more complex.
+
+```shell
+python -m pip install \
+ iree-compiler \
+ iree-runtime \
+ iree-tools-tf \
+ iree-tools-tflite \
+ iree-tools-xla
+```
+
+## Supported frameworks
+
+See end-to-end examples of how to execute a variety of models on IREE. These cover
+the import, compilation, and execution of each provided model.
+
+* [TensorFlow](./tensorflow.md)
+* [TensorFlow Lite](./tflite.md)
+* [JAX](./jax.md)
+
+Importing from PyTorch and other frameworks is planned - stay tuned!
+
+## Samples
+
+Check out the samples in IREE's [colab/ directory](https://github.com/google/iree/tree/main/colab),
+as well as the [iree-samples repository](https://github.com/google/iree-samples),
+which contains workflow comparisons across frameworks.
+
+## Import
+
+Importing converts a model from a known file type into a form that the core IREE
+compiler is able to ingest. This import process is specific to each frontend and typically
+involves a number of stages:
+
+* Load the source format
+* Legalize operations specific to each frontend into legal IR
+* Validate that only IREE-compatible operations remain
+* Write the remaining IR to a file
+
+This fully legalized form can then be compiled without dependencies on the
+source model language.
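+
+As a sketch of this step for TFLite, using the importer installed above (the
+file names are illustrative):
+
+```shell
+# Import a TFLite flatbuffer into IREE-compatible (TOSA) MLIR.
+iree-import-tflite model.tflite -o tosa.mlir
+```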
+
+## Compilation
+
+During compilation we load the imported MLIR file and compile it for the specified
+set of backends (CPU, GPU, etc.). Each backend generates native code to execute on its
+target device. Once compiled, the result is exported as an IREE bytecode file that can
+be executed on the specified devices.
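+
+A minimal sketch of this stage, compiling the TOSA MLIR imported above for the
+CPU backend (the same flags appear in the TFLite example later in these docs):
+
+```shell
+iree-translate \
+    --iree-mlir-to-vm-bytecode-module \
+    --iree-input-type=tosa \
+    --iree-hal-target-backends=dylib-llvm-aot \
+    tosa.mlir \
+    -o module.vmfb
+```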
+
+## Execution
+
+The final stage is executing the now compiled module. This involves selecting which
+compute devices should be used, loading the module, and invoking it with the
+intended inputs. For testing, IREE includes a Python API; on mobile and embedded
+devices you will want to use the [C API](../deployment-configurations/index.md).
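+
+A minimal sketch of this stage with the `iree-run-module` tool (the flag values,
+entry function name, and input shape here are illustrative; TFLite-imported
+modules export a single `main` function):
+
+```shell
+# Flags assume the CPU (dylib) driver used in the examples above.
+iree-run-module \
+    --driver=dylib \
+    --module_file=module.vmfb \
+    --entry_function=main \
+    --function_input="1x28x28x1xf32=0"
+```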
diff --git a/docs/website/docs/ml-frameworks/jax.md b/docs/website/docs/getting-started/jax.md
similarity index 100%
rename from docs/website/docs/ml-frameworks/jax.md
rename to docs/website/docs/getting-started/jax.md
diff --git a/docs/website/docs/ml-frameworks/tensorflow.md b/docs/website/docs/getting-started/tensorflow.md
similarity index 100%
rename from docs/website/docs/ml-frameworks/tensorflow.md
rename to docs/website/docs/getting-started/tensorflow.md
diff --git a/docs/website/docs/getting-started/tflite-cmd.md b/docs/website/docs/getting-started/tflite-cmd.md
new file mode 100644
index 0000000..a9bdb89
--- /dev/null
+++ b/docs/website/docs/getting-started/tflite-cmd.md
@@ -0,0 +1,33 @@
+# TFLite via Command Line
+
+IREE's tooling is divided into two components: import and compilation.
+
+1. The import tool converts the TFLite flatbuffer to an IREE-compatible form,
+validating that only IREE-compatible operations remain. The result contains a
+combination of TOSA and IREE operations.
+2. The compilation stage generates a bytecode module for a list of targets, which
+can then be executed by IREE.
+
+These two stages can be completed entirely via the command line.
+
+```shell
+WORKDIR="/tmp/workdir"
+TFLITE_URL="https://storage.googleapis.com/iree-model-artifacts/tflite-integration-tests/posenet_i8.tflite"
+TFLITE_PATH=${WORKDIR}/model.tflite
+IMPORT_PATH=${WORKDIR}/tosa.mlir
+MODULE_PATH=${WORKDIR}/module.vmfb
+
+# Create the working directory and fetch the sample model
+mkdir -p ${WORKDIR}
+wget ${TFLITE_URL} -O ${TFLITE_PATH}
+
+# Import the sample model to an IREE compatible form
+iree-import-tflite ${TFLITE_PATH} -o ${IMPORT_PATH}
+
+# Compile for the CPU backend
+iree-translate \
+ --iree-mlir-to-vm-bytecode-module \
+ --iree-input-type=tosa \
+ --iree-hal-target-backends=dylib-llvm-aot \
+ ${IMPORT_PATH} \
+ -o ${MODULE_PATH}
+```
diff --git a/docs/website/docs/getting-started/tflite-python.md b/docs/website/docs/getting-started/tflite-python.md
new file mode 100644
index 0000000..a46ef10
--- /dev/null
+++ b/docs/website/docs/getting-started/tflite-python.md
@@ -0,0 +1,79 @@
+# TFLite via Python
+
+The example below demonstrates downloading, compiling, and executing a TFLite
+model using the Python API. This includes some initial setup to declare global
+variables and to download the sample model and inputs.
+
+First, declare absolute paths for the working directory and import all required
+libraries. The default setup uses the CPU backend as the only target; this can be
+reconfigured to select alternative targets.
+
+```python
+import iree.compiler.tflite as iree_tflite_compile
+import iree.runtime as iree_rt
+import numpy
+import os
+import urllib.request
+
+from PIL import Image
+
+workdir = "/tmp/workdir"
+os.makedirs(workdir, exist_ok=True)
+
+tfliteFile = "/".join([workdir, "model.tflite"])
+jpgFile = "/".join([workdir, "input.jpg"])
+tfliteIR = "/".join([workdir, "tflite.mlir"])
+tosaIR = "/".join([workdir, "tosa.mlir"])
+bytecodeModule = "/".join([workdir, "iree.vmfb"])
+
+backends = ["dylib-llvm-aot"]
+config = "dylib"
+```
+
+The TFLite sample model and input are downloaded locally.
+
+```python
+tfliteUrl = "https://storage.googleapis.com/iree-model-artifacts/tflite-integration-tests/posenet_i8.tflite"
+jpgUrl = "https://storage.googleapis.com/iree-model-artifacts/tflite-integration-tests/posenet_i8_input.jpg"
+
+urllib.request.urlretrieve(tfliteUrl, tfliteFile)
+urllib.request.urlretrieve(jpgUrl, jpgFile)
+```
+
+Once downloaded, we can compile the model for the selected backends. Both the TFLite and TOSA representations
+of the model are saved for debugging purposes; this is optional and can be omitted.
+
+```python
+iree_tflite_compile.compile_file(
+ tfliteFile,
+ input_type="tosa",
+ output_file=bytecodeModule,
+ save_temp_tfl_input=tfliteIR,
+ save_temp_iree_input=tosaIR,
+ target_backends=backends,
+ import_only=False)
+```
+
+After compilation completes, we load the compiled IREE module into a VmModule and
+register it with a context configured for the dylib driver.
+
+```python
+config = iree_rt.Config("dylib")
+context = iree_rt.SystemContext(config=config)
+with open(bytecodeModule, 'rb') as f:
+ vm_module = iree_rt.VmModule.from_flatbuffer(f.read())
+ context.add_vm_module(vm_module)
+```
+
+Finally, the IREE module is loaded and ready for execution. Here we load the sample image, resize it to
+the expected input size, and execute the module. By default, TFLite models include a single
+function named 'main'. The final results are printed.
+
+```python
+im = numpy.array(Image.open(jpgFile).resize((192, 192))).reshape((1, 192, 192, 3))
+args = [im]
+
+invoke = context.modules.module["main"]
+iree_results = invoke(*args)
+print(iree_results)
+```
diff --git a/docs/website/docs/getting-started/tflite.md b/docs/website/docs/getting-started/tflite.md
new file mode 100644
index 0000000..c276350
--- /dev/null
+++ b/docs/website/docs/getting-started/tflite.md
@@ -0,0 +1,54 @@
+# TFLite Integration
+
+IREE supports compiling and running TensorFlow Lite programs stored as [TFLite
+flatbuffers](https://www.tensorflow.org/lite/guide). These files can be
+imported into an IREE-compatible format and then compiled for a set of target backends.
+
+## Prerequisites
+
+Install TensorFlow-Lite specific dependencies using pip:
+
+```shell
+python -m pip install \
+ iree-compiler \
+ iree-runtime \
+ iree-tools-tflite
+```
+
+- [Command Line](./tflite-cmd.md)
+- [Python API](./tflite-python.md)
+
+## Troubleshooting
+
+Failures during the import step usually indicate a failure to lower from
+TensorFlow Lite's operations to TOSA, the intermediate representation used by
+IREE. Many TensorFlow Lite operations are not fully supported, particularly
+those that use dynamic shapes. File an issue with IREE's TFLite model support
+[project](https://github.com/google/iree/projects/42).
+
+## Additional Samples
+
+* The
+[tflitehub folder](https://github.com/google/iree-samples/tree/main/tflitehub)
+in the [iree-samples repository](https://github.com/google/iree-samples)
+contains test scripts to compile, run, and compare various TensorFlow Lite
+models sourced from [TensorFlow Hub](https://tfhub.dev/).
+
+* An example smoke test of the
+[TensorFlow Lite C API](https://github.com/google/iree/tree/main/bindings/tflite)
+is available
+[here](https://github.com/google/iree/blob/main/bindings/tflite/smoke_test.cc).
+
+| Colab notebooks | |
+| -- | -- |
+Text classification with TFLite and IREE | [](https://colab.research.google.com/github/google/iree/blob/main/colab/tflite_text_classification.ipynb)
+
+!!! todo
+
+ [Issue#3954](https://github.com/google/iree/issues/3954): Add documentation
+ for an Android demo using the
+ [Java TFLite bindings](https://github.com/google/iree/tree/main/bindings/tflite/java),
+ once it is complete at
+ [not-jenni/iree-android-tflite-demo](https://github.com/not-jenni/iree-android-tflite-demo).
+
diff --git a/docs/website/docs/index.md b/docs/website/docs/index.md
index 60e9497..5523826 100644
--- a/docs/website/docs/index.md
+++ b/docs/website/docs/index.md
@@ -64,12 +64,14 @@
## Workflow overview
-Using IREE involves these general steps:
+Specific examples outlining IREE's workflow can be found in the
+[User Getting Started Guide](./getting-started/index.md). Using IREE involves the following
+general steps:
1. **Import your model**
- Work in your [framework of choice](./ml-frameworks), then run your model
- through one of IREE's import tools.
+ Develop your program using one of the [supported frameworks](./getting-started/#supported-frameworks), then import your model
+ using one of IREE's import tools.
2. **Select your [deployment configuration](./deployment-configurations)**
@@ -89,9 +91,9 @@
IREE supports importing models from a growing list of ML frameworks and model
formats:
-* [TensorFlow](ml-frameworks/tensorflow.md)
-* [TensorFlow Lite](ml-frameworks/tensorflow-lite.md)
-* [JAX](ml-frameworks/jax.md)
+* [TensorFlow](getting-started/tensorflow.md)
+* [TensorFlow Lite](getting-started/tflite.md)
+* [JAX](getting-started/jax.md)
### Selecting deployment configurations
diff --git a/docs/website/docs/ml-frameworks/index.md b/docs/website/docs/ml-frameworks/index.md
deleted file mode 100644
index ebdae58..0000000
--- a/docs/website/docs/ml-frameworks/index.md
+++ /dev/null
@@ -1,18 +0,0 @@
-# ML frameworks
-
-## Supported frameworks
-
-IREE supports importing models from
-
-* [TensorFlow](./tensorflow.md)
-* [TensorFlow Lite](./tensorflow-lite.md)
-* [JAX](./jax.md)
-
-Importing from PyTorch and other frameworks is planned - stay tuned!
-
-## Samples
-
-Check out the samples in IREE's
-[colab/ directory](https://github.com/google/iree/tree/main/colab) and the
-[iree-samples repository](https://github.com/google/iree-samples) for examples
-and workflow comparisons across frameworks.
diff --git a/docs/website/docs/ml-frameworks/tensorflow-lite.md b/docs/website/docs/ml-frameworks/tensorflow-lite.md
deleted file mode 100644
index 21c1c3b..0000000
--- a/docs/website/docs/ml-frameworks/tensorflow-lite.md
+++ /dev/null
@@ -1,96 +0,0 @@
-# TensorFlow Lite Integration
-
-IREE supports compiling and running pre-trained TensorFlow Lite (TFLite)
-models. It converts a model to
-[TOSA MLIR](https://mlir.llvm.org/docs/Dialects/TOSA/), then compiles it into a
-VM module.
-
-## Prerequisites
-
-Download a pre-trained TFLite model from the list of
-[hosted models](https://www.tensorflow.org/lite/guide/hosted_models), or use the
-[TensorFlow Lite converter](https://www.tensorflow.org/lite/convert) to convert
-a TensorFlow model to a .tflite flatbuffer.
-
-Install IREE pip packages, either from pip or by
-[building from source](../building-from-source/python-bindings-and-importers.md):
-
-```shell
-python -m pip install \
- iree-compiler \
- iree-runtime \
- iree-tools-tflite
-```
-
-!!! warning
- The TensorFlow Lite package is currently only available on Linux and macOS.
- It is not available on Windows yet (see
- [this issue](https://github.com/google/iree/issues/6417)).
-
-## Importing models
-
-Fist, import the TFLite model to TOSA MLIR:
-
-```shell
-iree-import-tflite \
- sample.tflite \
- -o sample.mlir
-```
-
-Next, compile the TOSA MLIR to a VM flatbuffer, using either the command line
-tools or the [Python API](https://google.github.io/iree/bindings/python/):
-
-#### Using the command-line tool
-
-``` shell
-iree-translate \
- --iree-mlir-to-vm-bytecode-module \
- --iree-input-type=tosa \
- --iree-hal-target-backends=vmvx \
- sample.mlir \
- -o sample.vmfb
-```
-
-#### Using the python API
-
-``` python
-from iree.compiler import compile_str
-with open('sample.mlir') as sample_tosa_mlir:
- compiled_flatbuffer = compile_str(sample_tosa_mlir.read(),
- input_type="tosa",
- target_backends=["vmvx"],
- extra_args=["--iree-native-bindings-support=false",
- "--iree-tflite-bindings-support"])
-```
-
-!!! todo
-
- [Issue#5462](https://github.com/google/iree/issues/5462): Link to
- TensorFlow Lite bindings documentation once it has been written.
-
-The flatbuffer can then be loaded to a VM module and run through IREE's runtime.
-
-## Samples
-
-* The
-[tflitehub folder](https://github.com/google/iree-samples/tree/main/tflitehub)
-in the [iree-samples repository](https://github.com/google/iree-samples)
-contains test scripts to compile, run, and compare various TensorFlow Lite
-models sourced from [TensorFlow Hub](https://tfhub.dev/).
-
-* An example smoke test of the
-[TensorFlow Lite C API](https://github.com/google/iree/tree/main/bindings/tflite)
-is available
-[here](https://github.com/google/iree/blob/main/bindings/tflite/smoke_test.cc).
-
-| Colab notebooks | |
-| -- | -- |
-Text classification with TFLite and IREE | [](https://colab.research.google.com/github/google/iree/blob/main/colab/tflite_text_classification.ipynb)
-
-!!! todo
-
- [Issue#3954](https://github.com/google/iree/issues/3954): Add documentation
- for an Android demo using the
- [Java TFLite bindings](https://github.com/google/iree/tree/main/bindings/tflite/java),
- once it is complete at
- [not-jenni/iree-android-tflite-demo](https://github.com/not-jenni/iree-android-tflite-demo).
diff --git a/docs/website/mkdocs.yml b/docs/website/mkdocs.yml
index 3c9e1fc..c7da8e5 100644
--- a/docs/website/mkdocs.yml
+++ b/docs/website/mkdocs.yml
@@ -94,11 +94,11 @@
# Note: may include external links and titles are optional for internal links
nav:
- Home: 'index.md'
- - 'ML frameworks':
- - 'ml-frameworks/index.md'
- - TensorFlow: 'ml-frameworks/tensorflow.md'
- - TensorFlow Lite: 'ml-frameworks/tensorflow-lite.md'
- - JAX: 'ml-frameworks/jax.md'
+ - 'Getting Started':
+ - 'getting-started/index.md'
+ - TensorFlow: 'getting-started/tensorflow.md'
+ - TensorFlow Lite: 'getting-started/tflite.md'
+ - JAX: 'getting-started/jax.md'
- 'Deployment configurations':
- 'deployment-configurations/index.md'
- CPU - Dylib: 'deployment-configurations/cpu-dylib.md'
diff --git a/experimental/sample_web_static/CMakeLists.txt b/experimental/sample_web_static/CMakeLists.txt
index 2cf9582..cedfd93 100644
--- a/experimental/sample_web_static/CMakeLists.txt
+++ b/experimental/sample_web_static/CMakeLists.txt
@@ -49,8 +49,9 @@
#
"-sASSERTIONS=1"
#
- "-gsource-map"
- "-source-map-base="
+ # https://developer.chrome.com/blog/wasm-debugging-2020/
+ "-g"
+ "-gseparate-dwarf"
)
#-------------------------------------------------------------------------------
@@ -91,8 +92,9 @@
#
"-sASSERTIONS=1"
#
- "-gsource-map"
- "-source-map-base="
+ # https://developer.chrome.com/blog/wasm-debugging-2020/
+ "-g"
+ "-gseparate-dwarf"
#
# https://emscripten.org/docs/porting/pthreads.html#compiling-with-pthreads-enabled
"-pthread"
diff --git a/experimental/sample_web_static/build_static_emscripten_demo.sh b/experimental/sample_web_static/build_static_emscripten_demo.sh
index e91b96c..cb98b85 100644
--- a/experimental/sample_web_static/build_static_emscripten_demo.sh
+++ b/experimental/sample_web_static/build_static_emscripten_demo.sh
@@ -67,6 +67,7 @@
# Note: The sample creates a task device directly, so no drivers are required,
# but some targets are gated on specific CMake options.
emcmake "${CMAKE_BIN?}" -G Ninja .. \
+ -DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DIREE_HOST_BINARY_ROOT=$PWD/../build-host/install \
-DIREE_BUILD_EXPERIMENTAL_WEB_SAMPLES=ON \
-DIREE_HAL_DRIVER_DEFAULTS=OFF \
@@ -83,20 +84,16 @@
# Serve the demo using a local webserver #
###############################################################################
-echo "=== Copying static files (index.html) to the build directory ==="
+echo "=== Copying static files to the build directory ==="
cp ${ROOT_DIR?}/experimental/sample_web_static/index.html ${BINARY_DIR}
+cp ${ROOT_DIR?}/experimental/sample_web_static/iree_api.js ${BINARY_DIR}
+cp ${ROOT_DIR?}/experimental/sample_web_static/iree_worker.js ${BINARY_DIR}
-echo "=== Running local webserver ==="
-echo " open at http://localhost:8000/build-emscripten/experimental/sample_web_static/"
+EASELJS_LIBRARY=${BINARY_DIR}/easeljs.min.js
+test -f ${EASELJS_LIBRARY} || \
+ wget https://code.createjs.com/1.0.0/easeljs.min.js -O ${EASELJS_LIBRARY}
-# **Note**: this serves from the root so source maps can reference code in the
-# source tree. A real deployment would bundle the output artifacts and serve
-# them from a build/release directory.
+echo "=== Running local webserver, open at http://localhost:8000/ ==="
-# local_server.py is needed when using SharedArrayBuffer, with multithreading
-# python3 local_server.py --directory ${ROOT_DIR?}
-
-# http.server on its own is fine for single threaded use, and this doesn't
-# break CORS for external resources like easeljs from a CDN
-python3 -m http.server --directory ${ROOT_DIR?}
+python3 ${ROOT_DIR?}/scripts/local_web_server.py --directory ${BINARY_DIR}
diff --git a/experimental/sample_web_static/index.html b/experimental/sample_web_static/index.html
index d3da38f..248b6a6 100644
--- a/experimental/sample_web_static/index.html
+++ b/experimental/sample_web_static/index.html
@@ -14,8 +14,8 @@
<title>IREE Static Web Sample</title>
<meta name="viewport" content="width=device-width, initial-scale=1">
- <!-- TODO(scotttodd): use local copy for CORS webserver / SharedArrayBuffer workarounds? -->
- <script src="https://code.createjs.com/1.0.0/easeljs.min.js"></script>
+ <script src="./easeljs.min.js"></script>
+ <script src="./iree_api.js"></script>
</head>
<body style="background-color: #2b2c30; color: #ABB2BF">
@@ -30,120 +30,53 @@
</canvas>
<br>
- <div style="border:2px solid #000000; background-color: #CCCCCC; padding: 8px; color: #111111" width="400px" height="300px">
+ <div style="border:2px solid #000000; background-color: #CCCCCC; padding: 8px; color: #111111; width:440px">
<button id="predictButton" disabled onclick="predictDigit()">Predict handwritten digit</button>
+ <button id="clearCanvasButton" onclick="clearCanvas()">Clear canvas</button>
<br>
- Prediction result: <div id="predictionResult"></div>
+ Prediction result: <div id="predictionResult" style="display:inline"></div>
</div>
<script>
- let setupNativeSample;
- let cleanupNativeSample;
- let runNativeSample;
- let nativeState;
- const predictionResultElement = document.getElementById("predictionResult");
- const predictButtonElement = document.getElementById("predictButton");
- let initialized = false;
-
- const imagePixelCount = 28 * 28;
- let imageBuffer;
-
- var Module = {
- print: function(text) {
- console.log(text);
- },
- printErr: function(text) {
- console.error(text);
- },
- onRuntimeInitialized: function() {
- console.log("WebAssembly module onRuntimeInitialized()");
-
- setupNativeSample = Module.cwrap("setup_sample", "number", []);
- cleanupNativeSample = Module.cwrap("cleanup_sample", null, ["number"]);
- runNativeSample = Module.cwrap("run_sample", "number", ["number", "number"]);
-
- setupSample();
- },
- // https://emscripten.org/docs/api_reference/module.html#Module.noInitialRun
- noInitialRun: true,
- };
-
- function setupSample() {
- nativeState = setupNativeSample();
- predictButtonElement.disabled = false;
- imageBuffer = Module._malloc(imagePixelCount * Float32Array.BYTES_PER_ELEMENT);
- initialized = true;
- }
-
- // TODO(scotttodd): call this on page suspend?
- function cleanupSample() {
- initialized = false;
- Module._free(imageDataBuffer);
- predictButtonElement.disabled = true;
- cleanupNativeSample();
- nativeState = null;
- }
-
- function predictDigit() {
- const rawImageData = getRescaledCanvasData();
- preprocessImageData(rawImageData);
-
- result = runNativeSample(nativeState, imageBuffer);
- if (result != -1) {
- predictionResultElement.innerHTML = result;
- } else {
- predictionResultElement.innerHTML = "Error";
- }
- }
-
- // https://becominghuman.ai/passing-and-returning-webassembly-array-parameters-a0f572c65d97
- // https://developers.google.com/web/updates/2018/03/emscripting-a-c-library#get_an_image_from_javascript_into_wasm
- function preprocessImageData(rawImageData) {
- // * getImageData() returns a Uint8ClampedArray with RGBA image data
- // * this MNIST model takes tensor<1x28x28x1xf32> with grayscale pixels
- // in [0.0, 1.0]
-
- // This conversion is terrible, but this is a toy demo with a small image
- // Hopefully there aren't any logic / iteration order issues...
- const typedArray = new Float32Array(imagePixelCount);
- for (let y = 0; y < 28; ++y) {
- for (let x = 0; x < 28; ++x) {
- const typedIndex = y * 28 + x;
- const rawIndex = 4 * (y * 28 + x) + 3; // Assume colorSpace srgb
- typedArray[typedIndex] = rawImageData.data[rawIndex] / 255.0;
- }
- }
-
- // Copy into Wasm heap.
- // Note: we could have done the conversion in-place, but this is demo code
- Module.HEAPF32.set(typedArray, imageBuffer >> 2);
- }
-
- </script>
- <script src="sample-web-static-sync.js"></script>
- <!-- <script src="sample-web-static-multithreaded.js"></script> -->
-
-
- <script>
- // Forked from:
+ // <canvas> drawing using easeljs forked from:
// https://createjs.com/demos/easeljs/curveto
// https://github.com/CreateJS/EaselJS/blob/master/examples/CurveTo.html
- let drawingCanvasElement;
- let rescaledCanvasElement, rescaledCanvasContext;
+ const predictButtonElement = document.getElementById('predictButton');
+ const predictionResultElement = document.getElementById('predictionResult');
+ const drawingCanvasElement = document.getElementById("drawingCanvas");
+ const rescaledCanvasElement = document.getElementById("rescaledCanvas");
+ const rescaledCanvasContext = rescaledCanvasElement.getContext("2d");
let stage;
let drawingCanvasShape;
let oldPt, oldMidPt;
let titleText;
+ let ireeInitialized = false;
const primaryColor = "#000000";
const eraseColor = "#FFFFFF";
const stroke = 32;
- function initDrawing() {
- drawingCanvasElement = document.getElementById("drawingCanvas");
+ function predictDigit() {
+ // TODO(scotttodd): debounce / rate limit this?
+ ireePredictDigit(getRescaledCanvasData()).then((result) => {
+ predictionResultElement.innerHTML = result;
+ }).catch((error) => {
+ predictionResultElement.innerHTML = "<b>" + error + "</b>";
+ });
+ }
- rescaledCanvasElement = document.getElementById("rescaledCanvas");
- rescaledCanvasContext = rescaledCanvasElement.getContext("2d");
+ function clearCanvas() {
+ stage.clear();
+ stage.removeAllChildren();
+
+ drawingCanvasShape = new createjs.Shape();
+ stage.addChild(drawingCanvasShape);
+ stage.update();
+
+ updateRescaledCanvas();
+ }
+
+ function initDrawing() {
rescaledCanvasContext.imageSmoothingEnabled = false;
rescaledCanvasContext.mozImageSmoothingEnabled = false;
rescaledCanvasContext.webkitImageSmoothingEnabled = false;
@@ -204,8 +137,7 @@
stage.update();
updateRescaledCanvas();
- if (initialized) {
- // TODO(scotttodd): debounce / rate limit this
+ if (ireeInitialized) {
predictDigit();
}
}
@@ -216,6 +148,8 @@
}
function updateRescaledCanvas() {
+ rescaledCanvasContext.clearRect(
+ 0, 0, rescaledCanvasElement.width, rescaledCanvasElement.height);
rescaledCanvasContext.drawImage(
drawingCanvasElement,
/*sx=*/0, /*sy=*/0,
@@ -229,6 +163,13 @@
}
initDrawing();
+
+ ireeInitializeWorker().then((result) => {
+ predictButtonElement.disabled = false;
+ ireeInitialized = true;
+ }).catch((error) => {
+ console.error("Failed to initialize IREE, error: '" + error + "'");
+ });
</script>
</body>
diff --git a/experimental/sample_web_static/iree_api.js b/experimental/sample_web_static/iree_api.js
new file mode 100644
index 0000000..f219e44
--- /dev/null
+++ b/experimental/sample_web_static/iree_api.js
@@ -0,0 +1,74 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Promise-based API for interacting with the IREE runtime.
+
+let ireeWorker = null;
+let nextMessageId = 0;
+const pendingPromises = {};
+
+// Communication protocol to and from the worker:
+// {
+// 'messageType': string
+// * the type of message (initialized, predict, etc.)
+// 'id': number?
+// * optional id to disambiguate messages of the same type
+// 'payload': Object?
+// * optional message data, format defined by message type
+// 'error': string?
+// * optional error message
+// }
+
+function _handleMessageFromWorker(messageEvent) {
+ const {messageType, id, payload, error} = messageEvent.data;
+
+ if (messageType == 'initialized') {
+ pendingPromises['initialize']['resolve']();
+ delete pendingPromises['initialize'];
+ } else if (messageType == 'predictResult') {
+ if (payload) {
+ pendingPromises[id]['resolve'](payload);
+ } else {
+ pendingPromises[id]['reject'](error);
+ }
+ delete pendingPromises[id];
+ }
+}
+
+// Initializes IREE's web worker asynchronously.
+// Resolves when the worker is fully initialized.
+function ireeInitializeWorker() {
+ return new Promise((resolve, reject) => {
+ pendingPromises['initialize'] = {
+ 'resolve': resolve,
+ 'reject': reject,
+ };
+
+ ireeWorker = new Worker('iree_worker.js');
+ ireeWorker.onmessage = _handleMessageFromWorker;
+ });
+}
+
+// Predicts the handwritten digit in a provided image asynchronously.
+// Input: 28x28 pixel data from CanvasRenderingContext2D.getImageData()
+// Resolves with a Number in [0, 9] (inclusive) on success
+function ireePredictDigit(imageData) {
+ return new Promise((resolve, reject) => {
+ const messageId = nextMessageId++;
+ const message = {
+ 'messageType': 'predict',
+ 'id': messageId,
+ 'payload': imageData,
+ };
+
+ pendingPromises[messageId] = {
+ 'resolve': resolve,
+ 'reject': reject,
+ };
+
+ ireeWorker.postMessage(message);
+ });
+}
diff --git a/experimental/sample_web_static/iree_worker.js b/experimental/sample_web_static/iree_worker.js
new file mode 100644
index 0000000..61b007d
--- /dev/null
+++ b/experimental/sample_web_static/iree_worker.js
@@ -0,0 +1,108 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+let wasmSetupSampleFn;
+let wasmCleanupSampleFn;
+let wasmRunSampleFn;
+let wasmState;
+let initialized = false;
+
+const IMAGE_PIXEL_COUNT = 28 * 28;
+const imageTypedArray = new Float32Array(IMAGE_PIXEL_COUNT);
+let imageBuffer;
+
+var Module = {
+ print: function(text) {
+ console.log('(C)', text);
+ },
+ printErr: function(text) {
+ console.error('(C)', text);
+ },
+ onRuntimeInitialized: function() {
+ console.log('WebAssembly module onRuntimeInitialized()');
+
+ wasmSetupSampleFn = Module.cwrap('setup_sample', 'number', []);
+ wasmCleanupSampleFn = Module.cwrap('cleanup_sample', null, ['number']);
+ wasmRunSampleFn =
+ Module.cwrap('run_sample', 'number', ['number', 'number']);
+
+ initializeSample();
+ },
+ noInitialRun: true,
+};
+
+function initializeSample() {
+ wasmState = wasmSetupSampleFn();
+ imageBuffer =
+ Module._malloc(IMAGE_PIXEL_COUNT * Float32Array.BYTES_PER_ELEMENT);
+ initialized = true;
+
+ postMessage({
+ 'messageType': 'initialized',
+ });
+}
+
+// TODO(scotttodd): call this on page suspend?
+function cleanupSample() {
+ initialized = false;
+ Module._free(imageBuffer);
+ wasmCleanupSampleFn();
+ wasmState = null;
+}
+
+// https://becominghuman.ai/passing-and-returning-webassembly-array-parameters-a0f572c65d97
+// https://developers.google.com/web/updates/2018/03/emscripting-a-c-library#get_an_image_from_javascript_into_wasm
+function preprocessImageDataIntoHeap(rawImageData) {
+ // rawImageData is a Uint8ClampedArray with RGBA image data
+ // * this MNIST model takes tensor<1x28x28x1xf32> with grayscale pixels
+ // in [0.0, 1.0]
+
+ // This conversion is terrible, but this is a toy demo with a small image
+ // Hopefully there aren't any logic / iteration order issues...
+ for (let y = 0; y < 28; ++y) {
+ for (let x = 0; x < 28; ++x) {
+ const typedIndex = y * 28 + x;
+ const rawIndex = 4 * (y * 28 + x) + 3; // Assume colorSpace srgb
+ imageTypedArray[typedIndex] = rawImageData.data[rawIndex] / 255.0;
+ }
+ }
+
+ // Copy into Wasm heap.
+ // Note: we could have done the conversion in-place, but this is demo code
+ Module.HEAPF32.set(imageTypedArray, imageBuffer >> 2);
+}
+
+function handlePredict(id, canvasData) {
+ if (!initialized) return;
+
+ preprocessImageDataIntoHeap(canvasData);
+ const result = wasmRunSampleFn(wasmState, imageBuffer);
+
+ if (result == -1) {
+ postMessage({
+ 'messageType': 'predictResult',
+ 'id': id,
+ 'error': 'Wasm module error, check console for details',
+ });
+ } else {
+ postMessage({
+ 'messageType': 'predictResult',
+ 'id': id,
+ 'payload': result,
+ });
+ }
+}
+
+onmessage = function(messageEvent) {
+ const {messageType, id, payload} = messageEvent.data;
+
+ if (messageType == 'predict') {
+ handlePredict(id, payload);
+ }
+};
+
+importScripts('sample-web-static-sync.js');
+// importScripts('sample-web-static-multithreaded.js');
diff --git a/integrations/tensorflow/test/python/iree_tfl_tests/mobilenet_v1_test.py b/integrations/tensorflow/test/python/iree_tfl_tests/mobilenet_v1_test.py
index 7286eff..ab9f15d 100644
--- a/integrations/tensorflow/test/python/iree_tfl_tests/mobilenet_v1_test.py
+++ b/integrations/tensorflow/test/python/iree_tfl_tests/mobilenet_v1_test.py
@@ -6,7 +6,7 @@
import absl.testing
import numpy
-import iree.tflite.support.test_util as test_util
+from . import test_util
model_path = "https://storage.googleapis.com/iree-model-artifacts/tflite-integration-tests/mobilenet_v1.tflite"
diff --git a/integrations/tensorflow/test/python/iree_tfl_tests/posenet_i8_test.py b/integrations/tensorflow/test/python/iree_tfl_tests/posenet_i8_test.py
index b40ae5e..3ec3540 100644
--- a/integrations/tensorflow/test/python/iree_tfl_tests/posenet_i8_test.py
+++ b/integrations/tensorflow/test/python/iree_tfl_tests/posenet_i8_test.py
@@ -5,7 +5,7 @@
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
import absl.testing
-import iree.tflite.support.test_util as test_util
+from . import test_util
import numpy
import urllib.request
diff --git a/integrations/tensorflow/python_projects/iree_tflite/iree/tflite/support/test_util.py b/integrations/tensorflow/test/python/iree_tfl_tests/test_util.py
similarity index 100%
rename from integrations/tensorflow/python_projects/iree_tflite/iree/tflite/support/test_util.py
rename to integrations/tensorflow/test/python/iree_tfl_tests/test_util.py
diff --git a/iree/base/tracing.cc b/iree/base/tracing.cc
index 870eabd..32e2826 100644
--- a/iree/base/tracing.cc
+++ b/iree/base/tracing.cc
@@ -203,21 +203,3 @@
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
-
-#if defined(__cplusplus) && \
- (IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING) && \
- !IREE_SANITIZER_ADDRESS && !IREE_SANITIZER_MEMORY && \
- !IREE_SANITIZER_THREAD
-
-void* operator new(size_t count) noexcept {
- auto ptr = malloc(count);
- IREE_TRACE_ALLOC(ptr, count);
- return ptr;
-}
-
-void operator delete(void* ptr) noexcept {
- IREE_TRACE_FREE(ptr);
- free(ptr);
-}
-
-#endif // __cplusplus && IREE_TRACING_FEATURE_ALLOCATION_TRACKING
diff --git a/iree/base/tracing.h b/iree/base/tracing.h
index 4050c88..f1bab69 100644
--- a/iree/base/tracing.h
+++ b/iree/base/tracing.h
@@ -29,6 +29,10 @@
#include <stdint.h>
#include <stdlib.h>
+#if defined(__cplusplus)
+#include <new>
+#endif
+
#include "iree/base/attributes.h"
#include "iree/base/config.h"
@@ -178,6 +182,16 @@
#undef TRACY_CALLSTACK
#endif // IREE_TRACING_MAX_CALLSTACK_DEPTH
+// By default, hook the C++ new and delete operators if allocation tracing
+// is enabled. We allow this to be controlled independently because not all
+// C++ code is well behaved with respect to allocations, and Tracy gets
+// very angry when unbalanced. Such code can:
+// #define IREE_TRACING_HOOK_CPP_NEW_DELETE 0
+// before including this file.
+#if !defined(IREE_TRACING_HOOK_CPP_NEW_DELETE)
+#define IREE_TRACING_HOOK_CPP_NEW_DELETE 1
+#endif
+
//===----------------------------------------------------------------------===//
// C API used for Tracy control
//===----------------------------------------------------------------------===//
@@ -466,10 +480,26 @@
#define IREE_TRACE_FREE_NAMED(name, ptr)
#endif // IREE_TRACING_FEATURE_ALLOCATION_TRACKING
-#if defined(__cplusplus) && \
- (IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING)
-void* operator new(size_t count) noexcept;
-void operator delete(void* ptr) noexcept;
+#if defined(__cplusplus) && IREE_TRACING_HOOK_CPP_NEW_DELETE && \
+ (IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING) && \
+ (!IREE_SANITIZER_ADDRESS && !IREE_SANITIZER_MEMORY && \
+ !IREE_SANITIZER_THREAD)
+inline void* operator new(size_t count, const std::nothrow_t&) noexcept {
+ auto ptr = malloc(count);
+ IREE_TRACE_ALLOC(ptr, count);
+ return ptr;
+}
+
+inline void* operator new(size_t count) throw(std::bad_alloc) {
+ auto ptr = malloc(count);
+ IREE_TRACE_ALLOC(ptr, count);
+ return ptr;
+}
+
+inline void operator delete(void* ptr) noexcept {
+ IREE_TRACE_FREE(ptr);
+ free(ptr);
+}
#endif // __cplusplus && IREE_TRACING_FEATURE_ALLOCATION_TRACKING
//===----------------------------------------------------------------------===//
diff --git a/iree/compiler/Codegen/BUILD b/iree/compiler/Codegen/BUILD
index f0ce273..ca89cc4 100644
--- a/iree/compiler/Codegen/BUILD
+++ b/iree/compiler/Codegen/BUILD
@@ -36,6 +36,7 @@
":PassesIncGen",
"//iree/compiler/Codegen/Dialect:IREECodegenDialect",
"//iree/compiler/Dialect/HAL/IR",
+ "//iree/compiler/Utils",
"@llvm-project//mlir:LinalgTransforms",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:Transforms",
diff --git a/iree/compiler/Codegen/CMakeLists.txt b/iree/compiler/Codegen/CMakeLists.txt
index ebd9500..7c91cf2 100644
--- a/iree/compiler/Codegen/CMakeLists.txt
+++ b/iree/compiler/Codegen/CMakeLists.txt
@@ -33,6 +33,7 @@
MLIRTransforms
iree::compiler::Codegen::Dialect::IREECodegenDialect
iree::compiler::Dialect::HAL::IR
+ iree::compiler::Utils
PUBLIC
)
diff --git a/iree/compiler/Codegen/Common/BUILD b/iree/compiler/Codegen/Common/BUILD
index 6717bad..cb8e1fb 100644
--- a/iree/compiler/Codegen/Common/BUILD
+++ b/iree/compiler/Codegen/Common/BUILD
@@ -44,6 +44,7 @@
"ForOpCanonicalizationPass.cpp",
"IREEComprehensiveBufferizePass.cpp",
"LinalgBufferizePass.cpp",
+ "MemrefCopyToLinalg.cpp",
"OptimizeVectorTransferPass.cpp",
"PolynomialApproximationPass.cpp",
"RemoveTrivialLoops.cpp",
@@ -91,7 +92,6 @@
"@llvm-project//mlir:MemRefTransforms",
"@llvm-project//mlir:ModuleBufferization",
"@llvm-project//mlir:Pass",
- "@llvm-project//mlir:SCFBufferizableOpInterfaceImpl",
"@llvm-project//mlir:SCFDialect",
"@llvm-project//mlir:SCFTransforms",
"@llvm-project//mlir:SideEffectInterfaces",
@@ -99,7 +99,7 @@
"@llvm-project//mlir:Support",
"@llvm-project//mlir:TensorDialect",
"@llvm-project//mlir:Transforms",
- "@llvm-project//mlir:VectorBufferizableOpInterfaceImpl",
"@llvm-project//mlir:VectorOps",
+ "@llvm-project//mlir:VectorTransforms",
],
)
diff --git a/iree/compiler/Codegen/Common/BufferizationAnalysis.cpp b/iree/compiler/Codegen/Common/BufferizationAnalysis.cpp
index 8b58f38..17f9d5d 100644
--- a/iree/compiler/Codegen/Common/BufferizationAnalysis.cpp
+++ b/iree/compiler/Codegen/Common/BufferizationAnalysis.cpp
@@ -24,7 +24,7 @@
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinTypes.h"
#define DEBUG_TYPE "iree-codegen-bufferization-analysis"
@@ -585,7 +585,20 @@
[&](scf::IfOp ifOp) { return analyseScfIfOp(ifOp, plan); })
.Case<scf::ForOp>(
[&](scf::ForOp forOp) { return analyseScfForOp(forOp, plan); })
- .Default([&](Operation *op) { return success(); });
+ .Case<scf::YieldOp, linalg::InitTensorOp, tensor::DimOp,
+ tensor::ExtractOp, tensor::PadOp>(
+ [&](Operation *op) { return success(); })
+ .Default([&](Operation *op) -> LogicalResult {
+ if (llvm::any_of(op->getOperands(),
+ [](Value v) {
+ return v.getType().isa<RankedTensorType>();
+ }) ||
+ llvm::any_of(op->getResultTypes(),
+ [](Type t) { return t.isa<RankedTensorType>(); })) {
+ return op->emitOpError("unhandled tensor operation");
+ }
+ return success();
+ });
};
if (funcOp.walk<WalkOrder::PreOrder>(bufferMappingFn).wasInterrupted()) {
return failure();
diff --git a/iree/compiler/Codegen/Common/CMakeLists.txt b/iree/compiler/Codegen/Common/CMakeLists.txt
index c5bcdc0..d5789de 100644
--- a/iree/compiler/Codegen/Common/CMakeLists.txt
+++ b/iree/compiler/Codegen/Common/CMakeLists.txt
@@ -35,6 +35,7 @@
"ForOpCanonicalizationPass.cpp"
"IREEComprehensiveBufferizePass.cpp"
"LinalgBufferizePass.cpp"
+ "MemrefCopyToLinalg.cpp"
"OptimizeVectorTransferPass.cpp"
"PolynomialApproximationPass.cpp"
"RemoveTrivialLoops.cpp"
@@ -67,7 +68,6 @@
MLIRModuleBufferization
MLIRPass
MLIRSCF
- MLIRSCFBufferizableOpInterfaceImpl
MLIRSCFToStandard
MLIRSCFTransforms
MLIRSideEffectInterfaces
@@ -76,7 +76,7 @@
MLIRTensor
MLIRTransforms
MLIRVector
- MLIRVectorBufferizableOpInterfaceImpl
+ MLIRVectorTransforms
iree::compiler::Codegen::Common::FoldTensorExtractOpIncGen
iree::compiler::Codegen::Dialect::IREECodegenDialect
iree::compiler::Codegen::Interfaces::BufferizationInterfaces
diff --git a/iree/compiler/Codegen/Common/ConvertToDestinationPassingStylePass.cpp b/iree/compiler/Codegen/Common/ConvertToDestinationPassingStylePass.cpp
index 4d4063f..f0d6fb7 100644
--- a/iree/compiler/Codegen/Common/ConvertToDestinationPassingStylePass.cpp
+++ b/iree/compiler/Codegen/Common/ConvertToDestinationPassingStylePass.cpp
@@ -32,7 +32,7 @@
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/Value.h"
diff --git a/iree/compiler/Codegen/Common/FlattenMemRefSubspanPass.cpp b/iree/compiler/Codegen/Common/FlattenMemRefSubspanPass.cpp
index ea2ad56..185c6e8 100644
--- a/iree/compiler/Codegen/Common/FlattenMemRefSubspanPass.cpp
+++ b/iree/compiler/Codegen/Common/FlattenMemRefSubspanPass.cpp
@@ -40,7 +40,7 @@
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/BuiltinAttributes.h"
diff --git a/iree/compiler/Codegen/Common/ForOpCanonicalizationPass.cpp b/iree/compiler/Codegen/Common/ForOpCanonicalizationPass.cpp
index 298012f..47d9b7e 100644
--- a/iree/compiler/Codegen/Common/ForOpCanonicalizationPass.cpp
+++ b/iree/compiler/Codegen/Common/ForOpCanonicalizationPass.cpp
@@ -7,7 +7,7 @@
#include "iree/compiler/Codegen/PassDetail.h"
#include "iree/compiler/Codegen/Passes.h"
#include "mlir/Dialect/SCF/SCF.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
diff --git a/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp b/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp
index 378a408..bf27427 100644
--- a/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp
+++ b/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp
@@ -39,7 +39,7 @@
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypes.h"
@@ -112,7 +112,8 @@
// Default allocation functions.
static FailureOr<Value> defaultAllocationFn(OpBuilder &builder, Location loc,
MemRefType allocationType,
- ValueRange dynamicSizes) {
+ ValueRange dynamicSizes,
+ unsigned int alignment) {
return builder.create<memref::AllocOp>(loc, allocationType, dynamicSizes)
.getResult();
}
@@ -123,7 +124,7 @@
}
static LogicalResult defaultMemCpyFn(OpBuilder &builder, Location loc,
Value from, Value to) {
- builder.create<linalg::CopyOp>(loc, from, to);
+ createLinalgCopyOp(builder, loc, from, to);
return success();
}
diff --git a/iree/compiler/Codegen/Common/LinalgBufferizePass.cpp b/iree/compiler/Codegen/Common/LinalgBufferizePass.cpp
index 19aa7d7..9eed987 100644
--- a/iree/compiler/Codegen/Common/LinalgBufferizePass.cpp
+++ b/iree/compiler/Codegen/Common/LinalgBufferizePass.cpp
@@ -57,7 +57,7 @@
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/Value.h"
@@ -707,7 +707,7 @@
Value outBuffer = bvm.lookupOrNull(outTensor);
if (outBuffer && !plan.isEquivalent(outTensor, resultTensor) &&
op.payloadUsesValueFromOperand(outOperand)) {
- b.create<linalg::CopyOp>(loc, outBuffer, resultBuffer);
+ createLinalgCopyOp(b, loc, outBuffer, resultBuffer);
}
newOutputBuffers.push_back(resultBuffer);
}
@@ -770,8 +770,7 @@
b, storeOp.getLoc(), storeFrom.getType().cast<ShapedType>().getRank(),
storeTo, storeOp.getMixedOffsets(), storeOp.getMixedSizes(),
storeOp.getMixedStrides());
-
- b.create<linalg::CopyOp>(storeOp->getLoc(), storeFrom, subview);
+ createLinalgCopyOp(b, storeOp->getLoc(), storeFrom, subview);
return success();
}
@@ -791,7 +790,7 @@
Value dest = op.dest();
if (!plan.isEquivalent(dest, result)) {
Value destBuffer = bvm.lookup(dest);
- b.create<linalg::CopyOp>(loc, destBuffer, resultBuffer);
+ createLinalgCopyOp(b, loc, destBuffer, resultBuffer);
}
Value source = op.source();
@@ -807,7 +806,7 @@
SmallVector<OpFoldResult> strides = op.getMixedStrides();
Value subViewOp = createSubviewOp(b, loc, sourceType.getRank(), resultBuffer,
offsets, sizes, strides);
- b.create<linalg::CopyOp>(loc, sourceBuffer, subViewOp);
+ createLinalgCopyOp(b, loc, sourceBuffer, subViewOp);
return success();
}
@@ -820,7 +819,7 @@
Value resultBuffer = bvm.lookup(result);
if (!plan.isEquivalent(op.dest(), result)) {
Value destBuffer = bvm.lookup(op.dest());
- b.create<linalg::CopyOp>(loc, destBuffer, resultBuffer);
+ createLinalgCopyOp(b, loc, destBuffer, resultBuffer);
}
b.create<memref::StoreOp>(loc, op.scalar(), resultBuffer, op.indices());
@@ -843,7 +842,7 @@
// initial value and can avoid the copy.
!op.source().getDefiningOp<linalg::InitTensorOp>()) {
Value destBuffer = bvm.lookup(op.source());
- b.create<linalg::CopyOp>(loc, destBuffer, resultBuffer);
+ createLinalgCopyOp(b, loc, destBuffer, resultBuffer);
}
// Create a new vector.transfer_write operation without a result value.
@@ -868,7 +867,7 @@
bvm.map(yieldOperand, resultBuffer);
if (!plan.isEquivalent(arg.value(), initOperand.get())) {
Value initBuffer = bvm.lookup(initOperand.get());
- b.create<linalg::CopyOp>(loc, initBuffer, resultBuffer);
+ createLinalgCopyOp(b, loc, initBuffer, resultBuffer);
}
}
return success();
@@ -901,8 +900,8 @@
for (auto result : enumerate(tiedResults)) {
Value operand = tiedOperands[result.index()];
if (!plan.isEquivalent(result.value(), operand)) {
- b.create<linalg::CopyOp>(loc, aliasingBuffers[result.index()],
- bvm.lookup(result.value()));
+ createLinalgCopyOp(b, loc, aliasingBuffers[result.index()],
+ bvm.lookup(result.value()));
}
}
}
@@ -960,7 +959,7 @@
tensorPadOp.getMixedLowPad(),
sizeMixedValues, strides);
// Copy to the interior region.
- b.create<linalg::CopyOp>(loc, inputMemref, resultSubView);
+ createLinalgCopyOp(b, loc, inputMemref, resultSubView);
return success();
}
diff --git a/iree/compiler/Codegen/Common/MemrefCopyToLinalg.cpp b/iree/compiler/Codegen/Common/MemrefCopyToLinalg.cpp
new file mode 100644
index 0000000..75fafc8
--- /dev/null
+++ b/iree/compiler/Codegen/Common/MemrefCopyToLinalg.cpp
@@ -0,0 +1,55 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Codegen/PassDetail.h"
+#include "iree/compiler/Codegen/Passes.h"
+#include "iree/compiler/Codegen/Utils/Utils.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+namespace mlir {
+namespace iree_compiler {
+
+namespace {
+struct MemrefCopyOpToLinalg : public OpRewritePattern<memref::CopyOp> {
+ using OpRewritePattern<memref::CopyOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(memref::CopyOp copyOp,
+ PatternRewriter &rewriter) const override {
+ Operation *linalgCopy = createLinalgCopyOp(
+ rewriter, copyOp.getLoc(), copyOp.source(), copyOp.target());
+ rewriter.replaceOp(copyOp, linalgCopy->getResults());
+ return success();
+ }
+};
+
+struct MemrefCopyToLinalgPass
+ : public MemrefCopyToLinalgPassBase<MemrefCopyToLinalgPass> {
+ void getDependentDialects(DialectRegistry ®istry) const override {
+ registry.insert<linalg::LinalgDialect>();
+ }
+
+ void runOnOperation() override {
+ MLIRContext *context = &getContext();
+ RewritePatternSet patterns(&getContext());
+ patterns.insert<MemrefCopyOpToLinalg>(context);
+ if (failed(applyPatternsAndFoldGreedily(getOperation(),
+ std::move(patterns)))) {
+ return signalPassFailure();
+ }
+ }
+};
+
+} // namespace
+
+std::unique_ptr<OperationPass<FuncOp>> createMemrefCopyToLinalgPass() {
+ return std::make_unique<MemrefCopyToLinalgPass>();
+}
+
+} // namespace iree_compiler
+} // namespace mlir
diff --git a/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp b/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp
index c958752..0d78039 100644
--- a/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp
+++ b/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp
@@ -10,8 +10,8 @@
#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
-#include "mlir/Dialect/Vector/VectorTransforms.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
diff --git a/iree/compiler/Codegen/Common/VectorizeConv.cpp b/iree/compiler/Codegen/Common/VectorizeConv.cpp
index cb2c10e..2a1bcf1 100644
--- a/iree/compiler/Codegen/Common/VectorizeConv.cpp
+++ b/iree/compiler/Codegen/Common/VectorizeConv.cpp
@@ -13,7 +13,7 @@
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/PatternMatch.h"
diff --git a/iree/compiler/Codegen/Common/VectorizeMMT4d.cpp b/iree/compiler/Codegen/Common/VectorizeMMT4d.cpp
index 715310a..db37a36 100644
--- a/iree/compiler/Codegen/Common/VectorizeMMT4d.cpp
+++ b/iree/compiler/Codegen/Common/VectorizeMMT4d.cpp
@@ -7,7 +7,7 @@
#include "iree/compiler/Codegen/PassDetail.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
namespace mlir {
diff --git a/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir b/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir
index df902a7..f6ecd57 100644
--- a/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir
+++ b/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir
@@ -62,12 +62,12 @@
// CHECK-DAG: %[[RHS_TILE:.+]] = memref.subview %[[RHS]][0, %[[IV1]]] [%[[K]], %[[TILESIZE_X]]]
// CHECK-DAG: %[[INIT_TILE:.+]] = memref.subview %[[INIT]][%[[IV0]], %[[IV1]]] [%[[TILESIZE_Y]], %[[TILESIZE_X]]]
// CHECK: %[[ALLOC:.+]] = memref.alloc(%[[TILESIZE_Y]], %[[TILESIZE_X]])
-// CHECK: linalg.copy(%[[INIT_TILE]], %[[ALLOC]])
+// CHECK: linalg.generic {{.*}} ins(%[[INIT_TILE]] {{.*}} outs(%[[ALLOC]]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS_TILE]], %[[RHS_TILE]]
// CHECK-SAME: outs(%[[ALLOC]]
// CHECK: %[[RESULT_TILE:.+]] = memref.subview %[[RESULT]][%[[IV0]], %[[IV1]]] [%[[TILESIZE_Y]], %[[TILESIZE_X]]]
-// CHECK: linalg.copy(%[[ALLOC]], %[[RESULT_TILE]])
+// CHECK: linalg.generic {{.*}} ins(%[[ALLOC]] {{.*}} outs(%[[RESULT_TILE]]
// CHECK: memref.dealloc %[[ALLOC]]
@@ -130,7 +130,7 @@
// CHECK: scf.for %[[IV0:.+]] = %[[OFFSET_Y]] to %[[M]] step %[[STEP_Y]]
// CHECK: %[[TILESIZE_Y:.+]] = affine.min #[[MAP1]](%[[IV0]])[%[[WG_SIZE_Y]], %[[M]]]
// CHECK: scf.for %[[IV1:.+]] = %[[OFFSET_X]] to %[[N]] step %[[STEP_X]]
-// CHECK-NOT: memref.copy
+// CHECK-NOT: linalg.generic
// CHECK: %[[TILESIZE_X:.+]] = affine.min #[[MAP1]](%[[IV1]])[%[[WG_SIZE_X]], %[[N]]]
// CHECK-DAG: %[[LHS_TILE:.+]] = memref.subview %[[LHS]][%[[IV0]], 0] [%[[TILESIZE_Y]], %[[K]]]
// CHECK-DAG: %[[RHS_TILE:.+]] = memref.subview %[[RHS]][0, %[[IV1]]] [%[[K]], %[[TILESIZE_X]]]
diff --git a/iree/compiler/Codegen/Common/test/linalg_bufferize.mlir b/iree/compiler/Codegen/Common/test/linalg_bufferize.mlir
index 773c778..ac78d86 100644
--- a/iree/compiler/Codegen/Common/test/linalg_bufferize.mlir
+++ b/iree/compiler/Codegen/Common/test/linalg_bufferize.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt %s --iree-codegen-linalg-bufferize -canonicalize -cse -split-input-file | FileCheck %s
+// RUN: iree-opt %s --iree-codegen-linalg-bufferize -canonicalize -cse -split-input-file -verify-diagnostics | FileCheck %s
func @tile_from_tensor_load() {
%c0 = arith.constant 0 : index
@@ -39,7 +39,7 @@
// CHECK-DAG: %[[RHS:.+]] = memref.subview %[[TENSOR_RHS]][0, %[[IV1]]] [3, 1] [1, 1]
// CHECK-DAG: %[[INIT:.+]] = memref.subview %[[TENSOR_INIT]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK-DAG: %[[RESULT:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
-// CHECK: linalg.copy(%[[INIT]], %[[RESULT]])
+// CHECK: linalg.generic {{.*}} ins(%[[INIT]] {{.*}} outs(%[[RESULT]]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS]], %[[RHS]]
// CHECK-SAME: outs(%[[RESULT]]
@@ -129,7 +129,7 @@
// CHECK-SAME: ins(%[[LHS]], %[[RHS]]
// CHECK-SAME: outs(%[[RESULT1]]
// CHECK: %[[RESULT2:.+]] = memref.subview %[[RETURN2]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
-// CHECK: linalg.copy(%[[RESULT1]], %[[RESULT2]])
+// CHECK: linalg.generic {{.*}} ins(%[[RESULT1]] {{.*}} outs(%[[RESULT2]]
// -----
@@ -182,7 +182,7 @@
// CHECK-SAME: outs(%[[ALLOC]]
// CHECK-DAG: %[[INIT:.+]] = memref.subview %[[TENSOR_INIT]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK-DAG: %[[RESULT:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
-// CHECK: linalg.copy(%[[INIT]], %[[RESULT]])
+// CHECK: linalg.generic {{.*}} ins(%[[INIT]] {{.*}} outs(%[[RESULT]]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[ALLOC]], %[[RHS]]
// CHECK-SAME: outs(%[[RESULT]]
@@ -388,7 +388,7 @@
// CHECK-DAG: %[[RHS:.+]] = memref.subview %[[TENSOR_RHS]][0, %[[IV1]]] [3, 1] [1, 1]
// CHECK-DAG: %[[INIT:.+]] = memref.subview %[[TENSOR_INIT]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK-DAG: %[[RESULT:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
-// CHECK: linalg.copy(%[[INIT]], %[[RESULT]])
+// CHECK: linalg.generic {{.*}} ins(%[[INIT]] {{.*}} outs(%[[RESULT]]
// CHECK: linalg.matmul
// CHECK-SAME: outs(%[[RESULT]]
// CHECK: linalg.matmul
@@ -509,7 +509,7 @@
// CHECK: %[[TILE_N_2:.+]] = affine.min #[[MAP2]](%[[IV1]])[%[[WGSIZE_X]], %[[DIM5]]]
// CHECK-DAG: %[[INIT_TILE:.+]] = memref.subview %[[INIT]][%[[IV0]], %[[IV1]]] [%[[TILE_M_2]], %[[TILE_N_2]]]
// CHECK-DAG: %[[RESULT_TILE:.+]] = memref.subview %[[RESULT]][%[[IV0]], %[[IV1]]] [%[[TILE_M_2]], %[[TILE_N_2]]]
-// CHECK: linalg.copy(%[[INIT_TILE]], %[[RESULT_TILE]])
+// CHECK: linalg.generic {{.*}} ins(%[[INIT_TILE]] {{.*}} outs(%[[RESULT_TILE]]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS_TILE]], %[[RHS_TILE]]
// CHECK-SAME: outs(%[[RESULT_TILE]]
@@ -601,7 +601,7 @@
// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
// CHECK: %[[RESHAPE:.+]] = memref.expand_shape %[[ARG0]] {{\[}}[0, 1]]
-// CHECK: linalg.copy(%[[RESHAPE]], %[[RET0]])
+// CHECK: linalg.generic {{.*}} ins(%[[RESHAPE]] {{.*}} outs(%[[RET0]]
// -----
@@ -671,7 +671,7 @@
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[RESHAPE]] : memref<3x4xi32>)
// CHECK-SAME: outs(%[[RET0]] : memref<3x4xi32>)
-// CHECK: linalg.copy(%[[RESHAPE]], %[[RET1]])
+// CHECK: linalg.generic {{.*}} ins(%[[RESHAPE]] {{.*}} outs(%[[RET1]]
// -----
@@ -779,7 +779,7 @@
// CHECK-DAG: %[[ARG:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[ARG]]
-// CHECK: linalg.copy(%[[SUBVIEW]], %[[RETURN]])
+// CHECK: linalg.generic {{.*}} ins(%[[SUBVIEW]] {{.*}} outs(%[[RETURN]]
// -----
@@ -802,7 +802,7 @@
// CHECK-DAG: %[[ARG:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[ARG]]
-// CHECK: linalg.copy(%[[SUBVIEW]], %[[RETURN]])
+// CHECK: linalg.generic {{.*}} ins(%[[SUBVIEW]] {{.*}} outs(%[[RETURN]]
// -----
@@ -834,10 +834,10 @@
// CHECK-DAG: %[[SIZE2:.+]] = hal.interface.constant.load[4] : index
// CHECK-DAG: %[[SIZE3:.+]] = hal.interface.constant.load[5] : index
// CHECK: %[[SUBVIEW1:.+]] = memref.subview %[[ARG]][%{{.+}}, %{{.+}}, %{{.+}}] [%[[SIZE1]], %[[SIZE2]], %[[SIZE3]]]
-// CHECK: linalg.copy(%[[SUBVIEW1]], %[[RETURN1]])
+// CHECK: linalg.generic {{.*}} ins(%[[SUBVIEW1]] {{.*}} outs(%[[RETURN1]]
// CHECK-DAG: %[[SUBVIEW2:.+]] = memref.subview %[[ARG]][%{{.+}}, %{{.+}}, %{{.+}}] [%[[SIZE1]], 1, %[[SIZE3]]]
// CHECK-DAG: %[[RETURNVIEW:.+]] = memref.subview %[[RETURN2]]
-// CHECK: linalg.copy(%[[SUBVIEW2]], %[[RETURNVIEW]])
+// CHECK: linalg.generic {{.*}} ins(%[[SUBVIEW2]] {{.*}} outs(%[[RETURNVIEW]]
// -----
@@ -852,7 +852,7 @@
}
// CHECK-LABEL: func @slice_in_place()
-// CHECK-NOT: linalg.copy
+// CHECK-NOT: linalg.generic
// -----
@@ -876,7 +876,7 @@
// CHECK-DAG: %[[OUTPUT:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
// CHECK-DAG: %[[SUBVIEW_INPUT:.+]] = memref.subview %[[INPUT]]
// CHECK-DAG: %[[SUBVIEW_OUTPUT:.+]] = memref.subview %[[OUTPUT]]
-// CHECK: linalg.copy(%[[SUBVIEW_INPUT]], %[[SUBVIEW_OUTPUT]])
+// CHECK: linalg.generic {{.*}} ins(%[[SUBVIEW_INPUT]] {{.*}} outs(%[[SUBVIEW_OUTPUT]]
// -----
@@ -907,9 +907,9 @@
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
// CHECK-DAG: %[[D0:.+]] = hal.interface.constant.load[0] : index
// CHECK-DAG: %[[D1:.+]] = hal.interface.constant.load[1] : index
-// CHECK: linalg.copy(%[[ARG1]], %[[RET0]])
+// CHECK: linalg.generic {{.*}} ins(%[[ARG1]] {{.*}} outs(%[[RET0]]
// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[RET0]][3, 4] [%[[D0]], %[[D1]]] [1, 1]
-// CHECK: linalg.copy(%[[ARG0]], %[[SUBVIEW]])
+// CHECK: linalg.generic {{.*}} ins(%[[ARG0]] {{.*}} outs(%[[SUBVIEW]]
// -----
@@ -945,7 +945,7 @@
// CHECK-LABEL: func @load_to_store()
// CHECK: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<3x4xi32>
// CHECK: %[[IN:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<3x4xi32>
-// CHECK: linalg.copy(%[[IN]], %[[OUT]]) : memref<3x4xi32>, memref<3x4xi32>
+// CHECK: linalg.generic {{.*}} ins(%[[IN]] {{.*}} outs(%[[OUT]]
// -----
@@ -961,7 +961,7 @@
// CHECK: %[[CST:.+]] = arith.constant {{.+}} : tensor<2x2x3xi32>
// CHECK: %[[MEMREF:.+]] = bufferization.to_memref %[[CST]] : memref<2x2x3xi32>
// CHECK: %[[RESULT:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
-// CHECK: linalg.copy(%[[MEMREF]], %[[RESULT]])
+// CHECK: linalg.generic {{.*}} ins(%[[MEMREF]] {{.*}} outs(%[[RESULT]]
// -----
@@ -1332,7 +1332,7 @@
// CHECK: %[[CAST5:.+]] = bufferization.to_memref %[[CST5]] : memref<5xi32>
// CHECK: %[[INPUT:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<5xf32>
// CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<i32>
-// CHECK: linalg.copy(%[[CAST1]], %[[OUTPUT]])
+// CHECK: linalg.generic {{.*}} ins(%[[CAST1]] {{.*}} outs(%[[OUTPUT]]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[INPUT]], %[[CAST5]] : memref<5xf32>, memref<5xi32>)
// CHECK-SAME: outs(%[[OUTPUT]] : memref<i32>)
@@ -1412,7 +1412,7 @@
// CHECK-DAG: %[[ARG:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
// CHECK-DAG: %[[RET:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[RET]]
-// CHECK: linalg.copy(%[[ARG]], %[[SUBVIEW]])
+// CHECK: linalg.generic {{.*}} ins(%[[ARG]] {{.*}} outs(%[[SUBVIEW]]
// -----
@@ -1467,7 +1467,7 @@
// CHECK-COUNT-3: vector.transfer_read %[[ARG1V]]
// CHECK-COUNT-2: vector.transfer_read %[[ARG2V]]
// CHECK: %[[RET0V:.+]] = memref.subview %[[RET0]]
-// CHECK: linalg.copy(%[[ARG2V]], %[[RET0V]])
+// CHECK: linalg.generic {{.*}} ins(%[[ARG2V]] {{.*}} outs(%[[RET0V]]
// CHECK: vector.transfer_write %{{.+}}, %[[RET0V]]
// CHECK: vector.transfer_write %{{.+}}, %[[RET0V]]
@@ -1521,7 +1521,7 @@
// CHECK-COUNT-6: vector.transfer_read %[[ARG0V]]
// CHECK-COUNT-3: vector.transfer_read %[[ARG1V]]
// CHECK-COUNT-2: vector.transfer_read %[[RET0V]]
-// CHECK-NOT: linalg.copy
+// CHECK-NOT: linalg.generic
// CHECK: vector.transfer_write %{{.+}}, %[[RET0V]]
// CHECK: vector.transfer_write %{{.+}}, %[[RET0V]]
@@ -1651,10 +1651,10 @@
// CHECK: linalg.fill(%[[C0]], %[[DST_V]])
// CHECK: linalg.fill(%[[C0]], %[[LHS_PADDED]]) : f32, memref<64x32xf32>
// CHECK: %[[LHS_PADDED_INTER:.+]] = memref.subview %[[LHS_PADDED]][0, 0] [64, 27] [1, 1]
-// CHECK: linalg.copy(%[[LHS_V]], %[[LHS_PADDED_INTER]])
+// CHECK: linalg.generic {{.*}} ins(%[[LHS_V]] {{.*}} outs(%[[LHS_PADDED_INTER]]
// CHECK: linalg.fill(%[[C0]], %[[RHS_PADDED]]) : f32, memref<32x16xf32>
// CHECK: %[[RHS_PADDED_INTER:.+]] = memref.subview %[[RHS_PADDED]][0, 0] [27, 16] [1, 1]
-// CHECK: linalg.copy(%[[RHS_V]], %[[RHS_PADDED_INTER]])
+// CHECK: linalg.generic {{.*}} ins(%[[RHS_V]] {{.*}} outs(%[[RHS_PADDED_INTER]]
// CHECK: linalg.matmul ins(%[[LHS_PADDED]], %[[RHS_PADDED]] : memref<64x32xf32>, memref<32x16xf32>)
// -----
@@ -1722,16 +1722,16 @@
// CHECK-DAG: %[[ARG1_SV:.+]] = memref.subview %[[ARG1]]
// CHECK: linalg.fill(%{{.*}}, %[[ALLOC_ARG0]]
// CHECK: %[[ALLOC_ARG0_SV:.+]] = memref.subview %[[ALLOC_ARG0]]
-// CHECK: linalg.copy(%[[ARG0_SV]], %[[ALLOC_ARG0_SV]])
+// CHECK: linalg.generic {{.*}} ins(%[[ARG0_SV]] {{.*}} outs(%[[ALLOC_ARG0_SV]]
// CHECK: linalg.fill(%{{.*}}, %[[ALLOC_ARG1]]
-// CHECK: linalg.copy(%[[ARG1_SV]]
+// CHECK: linalg.generic {{.*}} ins(%[[ARG1_SV]]
// CHECK: linalg.fill(%{{.*}}, %[[ALLOC_RET0]]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[ALLOC_ARG0]], %[[ALLOC_ARG1]]
// CHECK-SAME: outs(%[[ALLOC_RET0]]
// CHECK-DAG: %[[RET0_SV:.+]] = memref.subview %[[RET0]]
// CHECK-DAG: %[[ALLOC_RET0_SV:.+]] = memref.subview
-// CHECK: linalg.copy(%[[ALLOC_RET0_SV]], %[[RET0_SV]])
+// CHECK: linalg.generic {{.*}} ins(%[[ALLOC_RET0_SV]] {{.*}} outs(%[[RET0_SV]]
// -----
@@ -1875,7 +1875,7 @@
// CHECK-DAG: %[[RHS_WORKGROUP_TILE:.+]] = memref.subview %[[RHS]][0, %[[WORKGROUP_J]]] [144, %[[WORKGROUP_J_SIZE]]] [1, 1] : memref<144x370xf32> to memref<144x?xf32
// CHECK-DAG: %[[INIT_WORKGROUP_TILE:.+]] = memref.subview %[[INIT]][%[[WORKGROUP_I]], %[[WORKGROUP_J]]] [%[[WORKGROUP_I_SIZE]], %[[WORKGROUP_J_SIZE]]]
// CHECK-DAG: %[[DST_WORKGROUP_TILE:.+]] = memref.subview %[[DST]][%[[WORKGROUP_I]], %[[WORKGROUP_J]]] [%[[WORKGROUP_I_SIZE]], %[[WORKGROUP_J_SIZE]]]
-// CHECK: linalg.copy(%[[INIT_WORKGROUP_TILE]], %[[DST_WORKGROUP_TILE]])
+// CHECK: linalg.generic {{.*}} ins(%[[INIT_WORKGROUP_TILE]] {{.*}} outs(%[[DST_WORKGROUP_TILE]]
// CHECK: scf.for %[[L1_I:.+]] = %{{.*}} to %[[M]] step %[[L1_MN_SIZE]] {
// CHECK: scf.for %[[L1_J:.+]] = %{{.*}} to %[[N]] step %[[L1_MN_SIZE]] {
// CHECK: scf.for %[[L1_K:.+]] = %{{.*}} to %[[K]] step %[[L1_K_SIZE]] {
@@ -2110,7 +2110,7 @@
// CHECK-LABEL: func @sort1D()
// CHECK-DAG: %[[INPUT:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
// CHECK-DAG: %[[OUTPUT:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
-// CHECK: linalg.copy(%[[INPUT]], %[[OUTPUT]])
+// CHECK: linalg.generic {{.*}} ins(%[[INPUT]] {{.*}} outs(%[[OUTPUT]]
// CHECK: scf.for %[[ARG0:.+]] =
// CHECK: scf.for %[[ARG1:.+]] =
// CHECK-DAG: %[[P1:.+]] = arith.addi %[[ARG1]]
@@ -2229,7 +2229,7 @@
// CHECK-DAG: %[[DST_IDX_Y:.+]] = affine.apply #[[MAP]](%[[IV0]])[%[[OFFSET_Y]]]
// CHECK-DAG: %[[DST_IDX_X:.+]] = affine.apply #[[MAP]](%[[IV1]])[%[[OFFSET_X]]]
// CHECK: %[[DST_VIEW:.+]] = memref.subview %[[DST]][%[[DST_IDX_Y]], %[[DST_IDX_X]]]
-// CHECK: linalg.copy(%[[SRC_VIEW]], %[[DST_VIEW]])
+// CHECK: linalg.generic {{.*}} ins(%[[SRC_VIEW]] {{.*}} outs(%[[DST_VIEW]]
// -----
@@ -2273,7 +2273,7 @@
// CHECK-SAME: : memref<?xi32> to memref<?xi32, #{{.+}}>
// CHECK: %[[DST_VIEW:.+]] = memref.subview %[[DST]][0, %{{.+}}] [1, %{{.+}}]
// CHECK-SAME: : memref<?x?xi32> to memref<?xi32, #{{.+}}>
-// CHECK: linalg.copy(%[[SRC_VIEW]], %[[DST_VIEW]])
+// CHECK: linalg.generic {{.*}} ins(%[[SRC_VIEW]] {{.*}} outs(%[[DST_VIEW]]
// -----
@@ -2639,3 +2639,18 @@
}
// CHECK-LABEL: func @dot_general_nontrivial_batching_mutliple_parallel_dimension()
// CHECK-NOT: memref.alloc
+
+// -----
+
+func @dispatch() {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<readonly:4xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<readonly:4xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<writeonly:4xf32>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<?xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<?xf32>
+ // expected-error @+1 {{unhandled tensor operation}}
+ %5 = arith.mulf %3, %4 : tensor<?xf32>
+ flow.dispatch.tensor.store %5, %2, offsets = [0], sizes = [4], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
+ return
+}
diff --git a/iree/compiler/Codegen/Dialect/LoweringConfig.cpp b/iree/compiler/Codegen/Dialect/LoweringConfig.cpp
index 7548b46..ddd7575 100644
--- a/iree/compiler/Codegen/Dialect/LoweringConfig.cpp
+++ b/iree/compiler/Codegen/Dialect/LoweringConfig.cpp
@@ -334,7 +334,9 @@
IREE::Codegen::LoweringConfigAttr config,
IREE::Codegen::DispatchLoweringPassPipeline passPipeline,
ArrayRef<int64_t> workgroupSize) {
- auto partitionedLoops = getPartitionedLoops(op);
+ auto interfaceOp = cast<IREE::Flow::PartitionableLoopsInterface>(*op);
+ auto partitionedLoops =
+ interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
SmallVector<int64_t, 3> workloadPerWorkgroup;
auto tileSizes = config.getTileSizeVals(0);
if (!tileSizes.empty() && !partitionedLoops.empty()) {
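Call sites throughout this diff replace the free function getPartitionedLoops(op) with the new IREE::Flow::PartitionableLoopsInterface. The interface definition itself is not shown here; the usage pattern suggests a surface along these lines (a hedged usage sketch, not the generated declaration):

static SmallVector<unsigned> queryPartitionableLoops(Operation *op,
                                                     unsigned maxParallelDims) {
  // Ops that don't implement the interface expose no distributable loops.
  auto interfaceOp = dyn_cast<IREE::Flow::PartitionableLoopsInterface>(op);
  if (!interfaceOp) return {};
  // Returns the loop indices eligible for workgroup distribution, capped to
  // at most maxParallelDims entries (kNumMaxParallelDims at the call sites).
  return interfaceOp.getPartitionableLoops(maxParallelDims);
}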
diff --git a/iree/compiler/Codegen/Interfaces/BUILD b/iree/compiler/Codegen/Interfaces/BUILD
index 1a1fa95..f7d8e76 100644
--- a/iree/compiler/Codegen/Interfaces/BUILD
+++ b/iree/compiler/Codegen/Interfaces/BUILD
@@ -62,11 +62,11 @@
"@llvm-project//mlir:LinalgBufferizableOpInterfaceImpl",
"@llvm-project//mlir:MemRefDialect",
"@llvm-project//mlir:ModuleBufferization",
- "@llvm-project//mlir:SCFBufferizableOpInterfaceImpl",
- "@llvm-project//mlir:StdBufferizableOpInterfaceImpl",
+ "@llvm-project//mlir:SCFTransforms",
+ "@llvm-project//mlir:StandardOpsTransforms",
"@llvm-project//mlir:Support",
"@llvm-project//mlir:TensorTransforms",
- "@llvm-project//mlir:VectorBufferizableOpInterfaceImpl",
+ "@llvm-project//mlir:VectorTransforms",
],
)
diff --git a/iree/compiler/Codegen/Interfaces/BufferizationInterfaces.cpp b/iree/compiler/Codegen/Interfaces/BufferizationInterfaces.cpp
index 8f71296..06b9a26 100644
--- a/iree/compiler/Codegen/Interfaces/BufferizationInterfaces.cpp
+++ b/iree/compiler/Codegen/Interfaces/BufferizationInterfaces.cpp
@@ -17,11 +17,11 @@
#include "mlir/Dialect/Linalg/ComprehensiveBufferize/AffineInterfaceImpl.h"
#include "mlir/Dialect/Linalg/ComprehensiveBufferize/LinalgInterfaceImpl.h"
#include "mlir/Dialect/Linalg/ComprehensiveBufferize/ModuleBufferization.h"
-#include "mlir/Dialect/Linalg/ComprehensiveBufferize/SCFInterfaceImpl.h"
-#include "mlir/Dialect/Linalg/ComprehensiveBufferize/StdInterfaceImpl.h"
-#include "mlir/Dialect/Linalg/ComprehensiveBufferize/VectorInterfaceImpl.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/BufferizableOpInterfaceImpl.h"
+#include "mlir/Dialect/StandardOps/Transforms/BufferizableOpInterfaceImpl.h"
#include "mlir/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.h"
+#include "mlir/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.h"
#include "mlir/Support/LLVM.h"
using mlir::bufferization::AnalysisBufferizationOptions;
@@ -212,7 +212,7 @@
return eliminateInitTensors(
op, state, aliasInfo,
/*anchorMatchFunc=*/
- [&](OpOperand &operand) {
+ [&](OpOperand &operand, SmallVector<Value> &) {
return isa<IREE::Flow::DispatchTensorStoreOp>(operand.getOwner());
},
/*rewriteFunc=*/
@@ -284,15 +284,12 @@
arith::registerBufferizableOpInterfaceExternalModels(registry);
linalg::comprehensive_bufferize::linalg_ext::
registerBufferizableOpInterfaceExternalModels(registry);
- linalg::comprehensive_bufferize::scf_ext::
- registerBufferizableOpInterfaceExternalModels(registry);
+ scf::registerBufferizableOpInterfaceExternalModels(registry);
linalg::comprehensive_bufferize::std_ext::
registerModuleBufferizationExternalModels(registry);
- linalg::comprehensive_bufferize::std_ext::
- registerBufferizableOpInterfaceExternalModels(registry);
+ registerBufferizableOpInterfaceExternalModels(registry);
tensor::registerBufferizableOpInterfaceExternalModels(registry);
- linalg::comprehensive_bufferize::vector_ext::
- registerBufferizableOpInterfaceExternalModels(registry);
+ vector::registerBufferizableOpInterfaceExternalModels(registry);
// Register IREE operations.
registry.addOpInterface<IREE::Flow::DispatchTensorLoadOp,
diff --git a/iree/compiler/Codegen/Interfaces/CMakeLists.txt b/iree/compiler/Codegen/Interfaces/CMakeLists.txt
index 87d7d88..5be69ea 100644
--- a/iree/compiler/Codegen/Interfaces/CMakeLists.txt
+++ b/iree/compiler/Codegen/Interfaces/CMakeLists.txt
@@ -38,11 +38,11 @@
MLIRLinalgBufferizableOpInterfaceImpl
MLIRMemRef
MLIRModuleBufferization
- MLIRSCFBufferizableOpInterfaceImpl
- MLIRStdBufferizableOpInterfaceImpl
+ MLIRSCFTransforms
+ MLIRStandardOpsTransforms
MLIRSupport
MLIRTensorTransforms
- MLIRVectorBufferizableOpInterfaceImpl
+ MLIRVectorTransforms
iree::compiler::Dialect::Flow::IR
iree::compiler::Dialect::HAL::IR
PUBLIC
diff --git a/iree/compiler/Codegen/LLVMCPU/BUILD b/iree/compiler/Codegen/LLVMCPU/BUILD
index 4d61f31..f673b5a 100644
--- a/iree/compiler/Codegen/LLVMCPU/BUILD
+++ b/iree/compiler/Codegen/LLVMCPU/BUILD
@@ -34,9 +34,12 @@
"//iree/compiler/Codegen/Transforms",
"//iree/compiler/Codegen/Utils",
"//iree/compiler/Dialect/Flow/IR",
+ "//iree/compiler/Dialect/Flow/IR:PartitionableLoopsInterface",
"//iree/compiler/Dialect/HAL/IR",
"//iree/compiler/Dialect/HAL/IR:HALDialect",
+ "//iree/compiler/Dialect/HAL/Utils",
"//iree/compiler/Dialect/Util/IR",
+ "//iree/compiler/Utils",
"//llvm-external-projects/iree-dialects:IREELinalgExtDialect",
"//llvm-external-projects/iree-dialects:IREELinalgExtTransforms",
"@llvm-project//llvm:Support",
@@ -75,5 +78,6 @@
"@llvm-project//mlir:VectorOps",
"@llvm-project//mlir:VectorToLLVM",
"@llvm-project//mlir:VectorToSCF",
+ "@llvm-project//mlir:VectorTransforms",
],
)
diff --git a/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt b/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
index bc801a5..9704ec1 100644
--- a/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
+++ b/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
@@ -62,6 +62,7 @@
MLIRVector
MLIRVectorToLLVM
MLIRVectorToSCF
+ MLIRVectorTransforms
iree::compiler::Codegen::Common
iree::compiler::Codegen::Dialect::IREECodegenDialect
iree::compiler::Codegen::PassHeaders
@@ -69,9 +70,12 @@
iree::compiler::Codegen::Transforms
iree::compiler::Codegen::Utils
iree::compiler::Dialect::Flow::IR
+ iree::compiler::Dialect::Flow::IR::PartitionableLoopsInterface
iree::compiler::Dialect::HAL::IR
iree::compiler::Dialect::HAL::IR::HALDialect
+ iree::compiler::Dialect::HAL::Utils
iree::compiler::Dialect::Util::IR
+ iree::compiler::Utils
PUBLIC
)
diff --git a/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp b/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
index 1e752d2..ef1d325 100644
--- a/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
@@ -39,7 +39,7 @@
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Dialect/StandardOps/Transforms/Passes.h"
#include "mlir/Dialect/Tosa/IR/TosaOps.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/Pass.h"
diff --git a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index e4fe3bc..ac78c8e 100644
--- a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -11,7 +11,6 @@
#include "iree/compiler/Codegen/Utils/MarkerUtils.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
-#include "llvm/ADT/Triple.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetSelect.h"
@@ -72,29 +71,6 @@
return targetAttr && targetAttr.getBackend().getValue() == "vmvx";
}
-static Optional<llvm::Triple> getTargetTriple(FuncOp entryPointFn) {
- auto variantOp =
- entryPointFn->getParentOfType<IREE::HAL::ExecutableVariantOp>();
- IREE::HAL::ExecutableTargetAttr targetAttr = variantOp.target();
- if (!targetAttr) return llvm::None;
- auto config = targetAttr.getConfiguration();
- if (!config) return llvm::None;
- auto triple = config.getAs<StringAttr>("target_triple");
- if (!triple) return llvm::None;
- return llvm::Triple(triple.getValue().str());
-}
-
-static DispatchLoweringPassPipeline getDispatchLoweringPassPipeline(
- FuncOp entryPointFn, Operation *op) {
- return TypeSwitch<Operation *, DispatchLoweringPassPipeline>(op)
- .Case<linalg::ContractionOpInterface, linalg::Mmt4DOp>([&](auto op) {
- return DispatchLoweringPassPipeline::CPUTileFuseAndVectorize;
- })
- .Default([&](Operation *op) {
- return DispatchLoweringPassPipeline::CPUDefault;
- });
-}
-
/// Looks for the `native_vector_size` attribute in the hal.executable.variant
/// op.
static Optional<int64_t> getNativeVectorSizeInBytes(FuncOp entryPointFn) {
@@ -233,6 +209,61 @@
return workloadPerWorkgroup;
}
+/// Returns the largest tile size that is no larger than `maxSize`, evenly
+/// divides the static loop range, and is a multiple of `vectorSizeVal`, so
+/// that the tiled op vectorizes. Returns 0 when the range is smaller than the
+/// vector size, and falls back to `maxSize` when no qualifying size exists.
+static int64_t getMaxTileSize(int64_t lb, int64_t ub, int64_t maxSize,
+ int64_t vectorSizeVal) {
+ if (ub == ShapedType::kDynamicSize || lb == ShapedType::kDynamicSize) {
+ return maxSize;
+ }
+ int64_t dim = ub - lb;
+ if (dim < vectorSizeVal) return 0;
+ for (int64_t i = std::min(maxSize, dim); i > 0; --i) {
+ if (dim % i == 0 && i % vectorSizeVal == 0) {
+ return i;
+ }
+ }
+ return maxSize;
+}
+
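A few hand-checkable cases for getMaxTileSize as defined above. This is a standalone replica so the examples compile on their own; the ShapedType::kDynamicSize early-out is elided:

#include <algorithm>
#include <cassert>
#include <cstdint>

int64_t maxTileSize(int64_t lb, int64_t ub, int64_t maxSize, int64_t vec) {
  int64_t dim = ub - lb;
  if (dim < vec) return 0;  // caller then falls back to the full problem size
  for (int64_t i = std::min(maxSize, dim); i > 0; --i)
    if (dim % i == 0 && i % vec == 0) return i;
  return maxSize;  // no divisor that is a multiple of vec; keep the default
}

int main() {
  assert(maxTileSize(0, 96, 64, 8) == 48);   // 48 divides 96, multiple of 8
  assert(maxTileSize(0, 128, 64, 8) == 64);  // maxSize itself qualifies
  assert(maxTileSize(0, 100, 64, 8) == 64);  // no qualifying divisor: maxSize
  assert(maxTileSize(0, 4, 64, 8) == 0);     // range smaller than vector size
  return 0;
}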
+/// Compute the workload per workgroup. The `vectorSize` is expected to contain
+/// the vector size to use along each loop of the untiled operation.
+static SmallVector<int64_t> getDefaultWorkloadPerWorkgroup(
+ ArrayRef<LoopTilingAndDistributionInfo> tiledLoops,
+ ArrayRef<unsigned> partitionedLoops, ArrayRef<int64_t> vectorSize) {
+ if (tiledLoops.empty()) {
+ // Nothing to do.
+ return {};
+ }
+
+ assert(partitionedLoops.size() == tiledLoops.size() &&
+ "mismatch in expected parallelization");
+ SmallVector<int64_t> partitionedLoopsVectorSize(tiledLoops.size(), 1);
+ for (auto loopDim : llvm::enumerate(partitionedLoops)) {
+ partitionedLoopsVectorSize[loopDim.index()] = vectorSize[loopDim.value()];
+ }
+
+ SmallVector<int64_t> workLoadPerWorkgroup =
+ getDefaultWorkloadPerWorkgroup(tiledLoops, partitionedLoopsVectorSize);
+ for (auto tiledLoop : llvm::enumerate(tiledLoops)) {
+ Optional<int64_t> lb =
+ getConstantIntValue(tiledLoop.value().untiledLowerBound);
+ Optional<int64_t> ub =
+ getConstantIntValue(tiledLoop.value().untiledUpperBound);
+ if (!lb || !ub) continue;
+ unsigned workloadIndex = tiledLoops.size() - 1 - tiledLoop.index();
+ workLoadPerWorkgroup[workloadIndex] = getMaxTileSize(
+ lb.getValue(), ub.getValue(), workLoadPerWorkgroup[workloadIndex],
+ partitionedLoopsVectorSize[tiledLoop.index()]);
+ if (workLoadPerWorkgroup[workloadIndex] == 0) {
+      // If the chosen tile size is 0, set the workLoadPerWorkgroup to the
+      // full problem size.
+ workLoadPerWorkgroup[workloadIndex] = ub.getValue() - lb.getValue();
+ }
+ }
+ return workLoadPerWorkgroup;
+}
+
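To make the index bookkeeping concrete, here is a hand trace under stated assumptions; the pre-existing getDefaultWorkloadPerWorkgroup overload is not shown in this hunk, so its {64, 64} result below is hypothetical:

// tiledLoops:       loop0 over [0, 96), loop1 over [0, 100)
// partitionedLoops: {0, 1}, vectorSize: {8, 8}
// base overload:    workLoadPerWorkgroup = {64, 64}   (assumed)
// loop0 -> workloadIndex 1: getMaxTileSize(0, 96, 64, 8)  = 48
// loop1 -> workloadIndex 0: getMaxTileSize(0, 100, 64, 8) = 64
// result: {64, 48}; note workloadIndex = tiledLoops.size() - 1 - i, i.e.
// workload_per_wg is ordered innermost-loop-first while tiledLoops is
// ordered outermost-first.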
/// Sets the default launch configuration to use for a tiled + distributed
/// dispatch region based on the `tiledLoops` found.
static LogicalResult setDefaultLaunchConfig(
@@ -253,39 +284,20 @@
return success();
}
-/// Adjusts the workload per workgroup to be a multiple of vector size to ensure
-/// that the op vectorizes.
-static int64_t getMaxTileSize(int64_t lb, int64_t ub, int64_t maxSize,
- int64_t vectorSizeVal) {
- if (ub == ShapedType::kDynamicSize || lb == ShapedType::kDynamicSize) {
- return maxSize;
- }
- int64_t dim = ub - lb;
- if (dim < vectorSizeVal) return vectorSizeVal;
- for (int64_t i = std::min(maxSize, dim); i > 0; --i) {
- if (dim % i == 0 && i % vectorSizeVal == 0) {
- return i;
- }
- }
- return maxSize;
-}
-
-static LogicalResult setX86SandboxRootConfig(
- FuncOp entryPointFn, linalg::ContractionOpInterface op,
- SmallVector<int64_t> workloadPerWorkgroup, int vectorSize) {
- setTranslationInfo(entryPointFn,
- DispatchLoweringPassPipeline::CPUDoubleTilingExpert,
- workloadPerWorkgroup,
- /*workgroupSize=*/ArrayRef<int64_t>{});
-
+static LogicalResult setX86SandboxRootConfig(FuncOp entryPointFn,
+ linalg::ContractionOpInterface op,
+ ArrayRef<int64_t> flowTileSizes,
+ int vectorSize) {
// Hardcoded tile sizes. The configuration is derived from iree-llvm-sandbox.
// L1 tile sizes are {1, ..., 8, 32, 16}
SmallVector<int64_t> l1TileSizes;
int64_t nLoops = cast<linalg::LinalgOp>(op.getOperation()).getNumLoops();
l1TileSizes.append(nLoops - 3, 1);
- l1TileSizes.push_back(8);
- l1TileSizes.push_back(32);
- l1TileSizes.push_back(16);
+ l1TileSizes.push_back(getMaxTileSize(0, flowTileSizes[nLoops - 3], 8, 8));
+ l1TileSizes.push_back(getMaxTileSize(0, flowTileSizes[nLoops - 2], 32, 32));
+ auto lhsShapedType = op.lhs().getType().cast<ShapedType>();
+ int64_t K = lhsShapedType.getShape().back();
+ l1TileSizes.push_back(getMaxTileSize(0, K, 16, 16));
TileSizesListType tileSizes;
tileSizes.push_back({});
@@ -299,12 +311,7 @@
static LogicalResult setX86TileFuseAndVectorizeRootConfig(
FuncOp entryPointFn, linalg::ContractionOpInterface op,
- SmallVector<int64_t> workloadPerWorkgroup, int vectorSize) {
- setTranslationInfo(entryPointFn,
- DispatchLoweringPassPipeline::CPUTileFuseAndVectorize,
- workloadPerWorkgroup,
- /*workgroupSize=*/ArrayRef<int64_t>{});
-
+ ArrayRef<int64_t> flowTileSizes, int vectorSize) {
// Hardcoded tile sizes, where v is the native vector size.
// L1 tile sizes are {1, 1, ..., 8, 2v, 2v}.
// Vector tile sizes are {1, ..., 1, v, v}
@@ -312,9 +319,9 @@
int64_t nLoops = cast<linalg::LinalgOp>(op.getOperation()).getNumLoops();
l1TileSizes.append(nLoops - 3, 1);
l1TileSizes.push_back(
- getMaxTileSize(0, workloadPerWorkgroup[1], 8, vectorSize));
+ getMaxTileSize(0, flowTileSizes[nLoops - 3], 8, vectorSize));
l1TileSizes.push_back(
- getMaxTileSize(0, workloadPerWorkgroup[0], 2 * vectorSize, vectorSize));
+ getMaxTileSize(0, flowTileSizes[nLoops - 2], 2 * vectorSize, vectorSize));
vectorTileSizes.append(nLoops - 2, 1);
vectorTileSizes.push_back(vectorSize);
@@ -337,13 +344,8 @@
static LogicalResult setARMRootConfig(FuncOp entryPointFn,
linalg::ContractionOpInterface op,
- SmallVector<int64_t> workloadPerWorkgroup,
+ ArrayRef<int64_t> flowTileSizes,
int vectorSize) {
- setTranslationInfo(entryPointFn,
- getDispatchLoweringPassPipeline(entryPointFn, op),
- workloadPerWorkgroup,
- /*workgroupSize=*/ArrayRef<int64_t>{});
-
// Hardcoded tile sizes, where v is the native vector size.
// L1 tile sizes are {1, ..., 5v, v, 16v}.
// Vector tile sizes are {1, ..., v, v, v}
@@ -351,9 +353,9 @@
int64_t nLoops = cast<linalg::LinalgOp>(op.getOperation()).getNumLoops();
l1TileSizes.append(nLoops - 3, 1);
l1TileSizes.push_back(
- getMaxTileSize(0, workloadPerWorkgroup[1], 5 * vectorSize, vectorSize));
+ getMaxTileSize(0, flowTileSizes[nLoops - 3], 5 * vectorSize, vectorSize));
l1TileSizes.push_back(
- getMaxTileSize(0, workloadPerWorkgroup[0], vectorSize, vectorSize));
+ getMaxTileSize(0, flowTileSizes[nLoops - 2], vectorSize, vectorSize));
vectorTileSizes.append(nLoops - 3, 1);
vectorTileSizes.push_back(vectorSize);
vectorTileSizes.push_back(vectorSize);
@@ -382,35 +384,40 @@
ArrayRef<LoopTilingAndDistributionInfo> tiledLoops) {
auto lhsShapedType = contractionOp.lhs().getType().cast<ShapedType>();
// Use the default distribution for the matmul loops.
- int numBatchDims =
- cast<linalg::LinalgOp>(contractionOp.getOperation()).getNumLoops() - 3;
+ unsigned numBatchDims = 0;
+ auto interfaceOp = cast<IREE::Flow::PartitionableLoopsInterface>(
+ contractionOp.getOperation());
+ unsigned numLoops = interfaceOp.getNumLoops();
+ SmallVector<unsigned> partitionedLoops =
+ interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
+  // The batch dim is distributed only if numLoops > 3 and the outermost loop
+  // (loop 0) is among the partitioned loops.
+ if (numLoops > 3 && !partitionedLoops.empty() && partitionedLoops[0] == 0) {
+ numBatchDims = 1;
+ }
int64_t vectorSize = getVectorSize(entryPointFn, lhsShapedType);
- SmallVector<int64_t> vectorSizeVals(tiledLoops.size(), 1);
+ SmallVector<int64_t> vectorSizeVals(numLoops, 1);
vectorSizeVals.back() = vectorSize;
vectorSizeVals[vectorSizeVals.size() - 2] = vectorSize;
+ vectorSizeVals[vectorSizeVals.size() - 3] = vectorSize;
SmallVector<int64_t> workloadPerWorkgroup = getDefaultWorkloadPerWorkgroup(
tiledLoops.drop_front(numBatchDims),
+ ArrayRef<unsigned>(partitionedLoops).drop_front(numBatchDims),
ArrayRef<int64_t>(vectorSizeVals).drop_front(numBatchDims));
-
- for (unsigned i = tiledLoops.size() - 2; i < tiledLoops.size(); ++i) {
- if (!tiledLoops[i].untiledLowerBound.is<Attribute>() ||
- !tiledLoops[i].untiledUpperBound.is<Attribute>()) {
- continue;
- }
- auto lb =
- tiledLoops[i].untiledLowerBound.get<Attribute>().cast<IntegerAttr>();
- auto ub =
- tiledLoops[i].untiledUpperBound.get<Attribute>().cast<IntegerAttr>();
- workloadPerWorkgroup[tiledLoops.size() - 1 - i] = getMaxTileSize(
- lb.getInt(), ub.getInt(),
- workloadPerWorkgroup[tiledLoops.size() - 1 - i], vectorSizeVals[i]);
+ if (numBatchDims) {
+ workloadPerWorkgroup.push_back(1);
}
- workloadPerWorkgroup.append(numBatchDims, 1);
- Optional<llvm::Triple> triple = getTargetTriple(entryPointFn);
- if (triple && triple.getValue().isX86()) {
+ SmallVector<int64_t> flowTileSizes =
+ getDistributedTileSizes(interfaceOp, workloadPerWorkgroup);
+
+ Optional<DispatchLoweringPassPipeline> passPipeline = {};
+ if (isX86(entryPointFn)) {
+ setTranslationInfo(
+ entryPointFn, DispatchLoweringPassPipeline::CPUDoubleTilingExpert,
+ workloadPerWorkgroup, /*workgroupSize=*/ArrayRef<int64_t>{});
// There is a tileInterchange option. If it needs to be configured, we can
// only apply the pipeline to linalg.matmul. Because we don't know the
// number of loops when adding the pass to pass manager.
@@ -421,16 +428,35 @@
Type resElemType =
getElementTypeOrSelf(contractionOp->getResult(0).getType());
if (lhsElemType == rhsElemType && rhsElemType == resElemType) {
- return setX86SandboxRootConfig(entryPointFn, contractionOp,
- workloadPerWorkgroup, vectorSize);
+ passPipeline = DispatchLoweringPassPipeline::CPUDoubleTilingExpert;
+ if (failed(setX86SandboxRootConfig(entryPointFn, contractionOp,
+ flowTileSizes, vectorSize))) {
+ return failure();
+ }
} else {
- return setX86TileFuseAndVectorizeRootConfig(
- entryPointFn, contractionOp, workloadPerWorkgroup, vectorSize);
+ passPipeline = DispatchLoweringPassPipeline::CPUTileFuseAndVectorize;
+ if (failed(setX86TileFuseAndVectorizeRootConfig(
+ entryPointFn, contractionOp, flowTileSizes, vectorSize))) {
+ return failure();
+ }
+ }
+ } else {
+ // Fall back to ARM configurations.
+ passPipeline = DispatchLoweringPassPipeline::CPUTileFuseAndVectorize;
+ if (failed(setARMRootConfig(entryPointFn, contractionOp, flowTileSizes,
+ vectorSize))) {
+ return failure();
}
}
- // Fall back to ARM configurations.
- return setARMRootConfig(entryPointFn, contractionOp, workloadPerWorkgroup,
- vectorSize);
+
+ if (!passPipeline) {
+ // Do nothing.
+ return success();
+ }
+ setTranslationInfo(entryPointFn, passPipeline.getValue(),
+ workloadPerWorkgroup,
+ /*workgroupSize=*/ArrayRef<int64_t>{});
+ return success();
}
/// Sets the lowering configuration for dispatch region for linalg.mmt4d root
@@ -482,7 +508,7 @@
return setOpConfigAndEntryPointFnTranslation(
entryPointFn, mmt4dOp, tileSizes, nativeVectorSize,
- getDispatchLoweringPassPipeline(entryPointFn, mmt4dOp));
+ DispatchLoweringPassPipeline::CPUTileFuseAndVectorize);
}
/// Sets the lowering configuration for dispatch region for linalg_ext.fft
@@ -490,7 +516,9 @@
static LogicalResult setRootConfig(
FuncOp entryPointFn, IREE::LinalgExt::FftOp fftOp,
ArrayRef<LoopTilingAndDistributionInfo> tiledLoops) {
- auto partitionedLoops = getPartitionedLoops(fftOp);
+ auto interfaceOp = cast<IREE::Flow::PartitionableLoopsInterface>(*fftOp);
+ auto partitionedLoops =
+ interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
unsigned maxDepth = partitionedLoops.back() + 1;
SmallVector<int64_t> workgroupTileSizes(maxDepth, defaultWorkgroupTileSize);
llvm::DenseSet<unsigned> partitionedLoopsSet(partitionedLoops.begin(),
@@ -519,7 +547,7 @@
return setOpConfigAndEntryPointFnTranslation(
entryPointFn, fftOp, tileSizes,
/*nativeVectorSizes=*/ArrayRef<int64_t>{},
- getDispatchLoweringPassPipeline(entryPointFn, fftOp));
+ DispatchLoweringPassPipeline::CPUDefault);
}
/// Sets the lowering configuration for a generic op to use SingleTilingExpert.
@@ -554,16 +582,11 @@
}
// Set the flow level tiling to the default.
- SmallVector<int64_t> prunedNativeVectorSize(tiledLoops.size(), 1);
- if (!tiledLoops.empty()) {
- SmallVector<unsigned> partitionedLoops = getPartitionedLoops(genericOp);
- for (auto loopDim : llvm::enumerate(partitionedLoops)) {
- prunedNativeVectorSize[loopDim.index()] =
- nativeVectorSize[loopDim.value()];
- }
- }
- SmallVector<int64_t> workloadPerWorkgroup =
- getDefaultWorkloadPerWorkgroup(tiledLoops, prunedNativeVectorSize);
+ SmallVector<unsigned> partitionedLoops =
+ cast<IREE::Flow::PartitionableLoopsInterface>(genericOp.getOperation())
+ .getPartitionableLoops(kNumMaxParallelDims);
+ SmallVector<int64_t> workloadPerWorkgroup = getDefaultWorkloadPerWorkgroup(
+ tiledLoops, partitionedLoops, nativeVectorSize);
setTranslationInfo(entryPointFn,
DispatchLoweringPassPipeline::CPUSingleTilingExpert,
workloadPerWorkgroup,
@@ -627,10 +650,10 @@
}
if (rootOp) return success();
- // If there are any other ops other than `linalg.generic`, `linalg.copy` or
+  // If there are any ops other than `linalg.generic` or
// `linalg.fill` then just use the default.
for (auto computeOp : computeOps) {
- if (!isa<linalg::GenericOp, linalg::CopyOp, linalg::FillOp>(computeOp)) {
+ if (!isa<linalg::GenericOp, linalg::FillOp>(computeOp)) {
return success();
}
}
diff --git a/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileFuseAndVectorizeLinalgTensorOps.cpp b/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileFuseAndVectorizeLinalgTensorOps.cpp
index 1a5e806..88f5c44 100644
--- a/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileFuseAndVectorizeLinalgTensorOps.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileFuseAndVectorizeLinalgTensorOps.cpp
@@ -11,6 +11,7 @@
#include "iree/compiler/Codegen/Transforms/Transforms.h"
#include "iree/compiler/Codegen/Utils/MarkerUtils.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
+#include "iree/compiler/Dialect/HAL/Utils/InferCustomKernelsTargetInfoFromParent.h"
#include "llvm/Support/Debug.h"
#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
@@ -19,7 +20,7 @@
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Dialect/SCF/Transforms.h"
-#include "mlir/Dialect/Vector/VectorTransforms.h"
+#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -30,7 +31,7 @@
// A flag to switch between inline asm and intrinsics while we develop these two
// parallel paths.
-static llvm::cl::opt<bool> clUseMmt4dUseIntrinsics(
+static llvm::cl::opt<bool> clMmt4dUseIntrinsics(
"iree-codegen-mmt4d-use-intrinsics",
    llvm::cl::desc("Whether to use intrinsics when lowering vector contracts "
"generated from mmt4d matmuls (as opposed to inline asm). "
@@ -129,16 +130,22 @@
}
}
assert(consumerOp && "can't find consumerOp");
- SmallVector<int64_t> consumerTileSize(
+ // Only the parallel loops of the consumer can be tiled for fusion.
+ SmallVector<int64_t> consumerTileSizes(
tileSizes.begin(),
tileSizes.begin() + consumerOp.getNumParallelLoops());
- auto identityIndicesOrder =
- llvm::to_vector<4>(llvm::seq<int64_t>(0, consumerTileSize.size()));
- FailureOr<linalg::TileLoopNest> tileLoopNest =
- linalg::tileConsumerAndFuseProducers(
- builder, consumerOp, consumerTileSize, identityIndicesOrder);
- if (failed(tileLoopNest)) return signalPassFailure();
- consumerOp->replaceAllUsesWith(tileLoopNest->getRootOpReplacementResults());
+  // TODO: The fusion method fails to handle the corner case when no tiling
+ // is needed and segfaults/asserts. So guard it for now.
+ if (llvm::any_of(consumerTileSizes, [](int64_t v) { return v; })) {
+ auto identityIndicesOrder =
+ llvm::to_vector<4>(llvm::seq<int64_t>(0, consumerTileSizes.size()));
+ FailureOr<linalg::TileLoopNest> tileLoopNest =
+ linalg::tileConsumerAndFuseProducers(
+ builder, consumerOp, consumerTileSizes, identityIndicesOrder);
+ if (failed(tileLoopNest)) return signalPassFailure();
+ consumerOp->replaceAllUsesWith(
+ tileLoopNest->getRootOpReplacementResults());
+ }
  // Apply canonicalization
if (failed(applyTileAndFuseCanonicalizationPatterns(funcOp))) {
@@ -278,9 +285,9 @@
linalg::LinalgVectorizationOptions opt;
linalg::LinalgTransformationFilter f(
StringAttr::get(context, getVectorizeMarker()));
- linalg::VectorizationPatterns<linalg::GenericOp, linalg::CopyOp,
- linalg::FillOp>::insert(vectorizationPatterns,
- opt, f);
+ linalg::VectorizationPatterns<linalg::GenericOp, linalg::FillOp>::insert(
+ vectorizationPatterns, opt, f);
+ vectorizationPatterns.add<linalg::CopyVectorizationPattern>(context);
vectorizationPatterns.add<linalg::LinalgVectorizationPattern>(
&getContext(), f.addOpFilter<linalg::ContractionOpInterface>(), opt);
vector::populateVectorTransferPermutationMapLoweringPatterns(
@@ -351,7 +358,9 @@
// just before the generic vector ops lowerings.
CustomKernelsTargetInfo info;
if (succeeded(InferCustomKernelsTargetInfoFromParent(funcOp, info))) {
- info.intrinsics = clUseMmt4dUseIntrinsics;
+ if (clMmt4dUseIntrinsics) {
+ info.add(CustomKernelTargetFeature::Intrinsics);
+ }
RewritePatternSet patterns(context);
populateVectorContractCustomKernelsPatterns(info, patterns);
if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) {
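The old info.intrinsics boolean has become a feature-set entry. The new CustomKernelsTargetInfo API lives in iree/compiler/Utils/CustomKernelsTargetInfo.h (newly included above), not in this diff; a minimal sketch of the assumed shape, where everything beyond add() and the Intrinsics enumerator is hypothetical:

// Hedged sketch of the assumed feature-set API, one bit per feature.
enum class CustomKernelTargetFeatureSketch { Intrinsics /*, ... */ };
struct CustomKernelsTargetInfoSketch {
  uint64_t features = 0;
  void add(CustomKernelTargetFeatureSketch f) {
    features |= uint64_t(1) << static_cast<int>(f);
  }
  bool has(CustomKernelTargetFeatureSketch f) const {
    return (features >> static_cast<int>(f)) & 1;
  }
};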
diff --git a/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index 0131ae1..32ca678 100644
--- a/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -12,6 +12,7 @@
#include "iree/compiler/Codegen/PassDetail.h"
#include "iree/compiler/Codegen/Sandbox/Passes.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
+#include "iree/compiler/Dialect/Flow/IR/PartitionableLoopsInterface.h"
#include "llvm/Support/CommandLine.h"
#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
#include "mlir/Conversion/SCFToStandard/SCFToStandard.h"
@@ -47,22 +48,26 @@
}
// Allocation callbacks to use with upstream comprehensive bufferization
-static Optional<Value> cpuComprehensiveBufferizeAllocationFn(
+static FailureOr<Value> cpuComprehensiveBufferizeAllocationFn(
OpBuilder &builder, Location loc, MemRefType memRefType,
- ArrayRef<Value> dynamicSizes) {
- return builder.create<memref::AllocaOp>(loc, memRefType, dynamicSizes)
+ ValueRange dynamicSizes, unsigned alignment) {
+ return builder
+ .create<memref::AllocaOp>(loc, memRefType, dynamicSizes,
+ builder.getI64IntegerAttr(alignment))
.getResult();
}
-static void cpuComprehensiveBufferizeDeallocationFn(OpBuilder &builder,
- Location loc,
- Value allocation) {
- return;
+static LogicalResult cpuComprehensiveBufferizeDeallocationFn(OpBuilder &builder,
+ Location loc,
+ Value allocation) {
+ return success();
}
-static void cpuComprehensiveBufferizeCopyFn(OpBuilder &builder, Location loc,
- Value from, Value to) {
- builder.create<linalg::CopyOp>(loc, from, to);
+static LogicalResult cpuComprehensiveBufferizeCopyFn(OpBuilder &builder,
+ Location loc, Value from,
+ Value to) {
+ createLinalgCopyOp(builder, loc, from, to);
+ return success();
}
//===---------------------------------------------------------------------===//
@@ -87,60 +92,67 @@
<< pipelineName;
}
+ if (loweringConfig.getTileSizes().size() != 2) {
+ return op->emitOpError("expected two levels of tile sizes for ")
+ << pipelineName << ", got " << loweringConfig.getTileSizes().size();
+ }
+
// Verify that the workload per workgroup is set and is non-zero.
SmallVector<int64_t> workloadPerWorkgroup =
translationInfo.getWorkloadPerWorkgroupVals();
if (workloadPerWorkgroup.size() > kNumMaxParallelDims) {
- return op->emitOpError("workload_per_wg size should be less than ")
+ return op->emitOpError(
+ "workload_per_wg size should be less than or equal to ")
<< kNumMaxParallelDims;
}
- if (isa<linalg::LinalgOp, IREE::LinalgExt::TiledOpInterface>(op)) {
- SmallVector<unsigned> partitionedLoops = getPartitionedLoops(op);
- if (workloadPerWorkgroup.size() != partitionedLoops.size()) {
- return op->emitOpError("expected ")
- << partitionedLoops.size()
- << " entries for workload_per_wg, but got "
- << workloadPerWorkgroup.size();
- }
- }
if (llvm::any_of(workloadPerWorkgroup,
[](int64_t val) { return val == 0; })) {
return op->emitOpError("invalid to use 0 in workload_per_wg");
}
- if (loweringConfig.getTileSizes().size() != 2) {
- return op->emitOpError("expected two levels of tile sizes for ")
- << pipelineName << ", got " << loweringConfig.getTileSizes().size();
- }
- SmallVector<int64_t> firstLevelTileSizes = loweringConfig.getTileSizeVals(
- static_cast<unsigned>(TilingLevel::WorkGroupTiles));
- if (!firstLevelTileSizes.empty()) {
- // Verify that if the first-level tile sizes are set, they are the same as
- // workload_per_wg for the partitioned loops.
- SmallVector<unsigned> partitionedLoops = getPartitionedLoops(op);
- size_t minElements =
- (partitionedLoops.empty() ? 0 : partitionedLoops.back() + 1);
- if (firstLevelTileSizes.size() < minElements) {
- return op->emitOpError("expected at least ")
- << minElements
- << " size for first level tiling to get the distribution fully "
- "specified.";
+ IREE::Flow::PartitionableLoopsInterface interfaceOp =
+ dyn_cast_or_null<IREE::Flow::PartitionableLoopsInterface>(op);
+ if (interfaceOp) {
+ SmallVector<unsigned> partitionedLoops =
+ interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
+ if (workloadPerWorkgroup.size() != partitionedLoops.size()) {
+ return op->emitOpError("expected ")
+ << partitionedLoops.size()
+ << " entries for workload_per_wg, but got "
+ << workloadPerWorkgroup.size();
}
- llvm::SmallDenseSet<unsigned> partitionedLoopsSet;
- partitionedLoopsSet.insert(partitionedLoops.begin(),
- partitionedLoops.end());
- SmallVector<int64_t> partitionedTileSizes;
- for (auto tileSize : llvm::enumerate(firstLevelTileSizes)) {
- if (!partitionedLoopsSet.count(tileSize.index())) {
- continue;
+ SmallVector<int64_t> firstLevelTileSizes = loweringConfig.getTileSizeVals(
+ static_cast<unsigned>(TilingLevel::WorkGroupTiles));
+
+ if (!firstLevelTileSizes.empty()) {
+ // Verify that if the first-level tile sizes are set, they are the same as
+ // workload_per_wg for the partitioned loops.
+ SmallVector<unsigned> partitionedLoops =
+ interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
+ size_t minElements =
+ (partitionedLoops.empty() ? 0 : partitionedLoops.back() + 1);
+ if (firstLevelTileSizes.size() < minElements) {
+ return op->emitOpError("expected at least ")
+ << minElements
+ << " size for first level tiling to get the distribution fully "
+ "specified.";
}
- partitionedTileSizes.push_back(tileSize.value());
- }
- for (auto val : llvm::enumerate(llvm::reverse(workloadPerWorkgroup))) {
- if (val.value() != partitionedTileSizes[val.index()]) {
- return op->emitOpError("mismatch in distributed tile size value ")
- << partitionedTileSizes[val.index()] << " at position "
- << val.index() << " and workload_per_wg value " << val.value();
+ llvm::SmallDenseSet<unsigned> partitionedLoopsSet;
+ partitionedLoopsSet.insert(partitionedLoops.begin(),
+ partitionedLoops.end());
+ SmallVector<int64_t> partitionedTileSizes;
+ for (auto tileSize : llvm::enumerate(firstLevelTileSizes)) {
+ if (!partitionedLoopsSet.count(tileSize.index())) {
+ continue;
+ }
+ partitionedTileSizes.push_back(tileSize.value());
+ }
+ for (auto val : llvm::enumerate(llvm::reverse(workloadPerWorkgroup))) {
+ if (val.value() != partitionedTileSizes[val.index()]) {
+ return op->emitOpError("mismatch in distributed tile size value ")
+ << partitionedTileSizes[val.index()] << " at position "
+ << val.index() << " and workload_per_wg value " << val.value();
+ }
}
}
}
@@ -234,15 +246,13 @@
passManager.addNestedPass<FuncOp>(createCSEPass());
}
- // TODO(ravishankarm): This is commented cause this is WIP, to be enabled
- // soon.
- // auto callbacks =
- // std::make_unique<linalg::comprehensive_bufferize::AllocationCallbacks>(
- // cpuComprehensiveBufferizeAllocationFn,
- // cpuComprehensiveBufferizeDeallocationFn,
- // cpuComprehensiveBufferizeCopyFn);
- // addIREEComprehensiveBufferizePasses(passManager, std::move(callbacks));
- addLinalgBufferizePasses(passManager, cpuAllocationFunction);
+ BufferizationOptions::AllocationFn allocationFn =
+ cpuComprehensiveBufferizeAllocationFn;
+ BufferizationOptions::DeallocationFn deallocationFn =
+ cpuComprehensiveBufferizeDeallocationFn;
+ BufferizationOptions::MemCpyFn memcpyFn = cpuComprehensiveBufferizeCopyFn;
+ addIREEComprehensiveBufferizePasses(passManager, allocationFn, deallocationFn,
+ memcpyFn);
// Run IREE specific passes before vector lowering expert.
passManager.addNestedPass<FuncOp>(createRemoveSingleIterationLoopPass());
@@ -298,6 +308,7 @@
IREE::LinalgExt::createLinalgExtToLoopsPass());
// Linalg -> SCF
+ passManager.addNestedPass<FuncOp>(createMemrefCopyToLinalgPass());
passManager.addNestedPass<FuncOp>(createConvertLinalgToLoopsPass());
passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
passManager.addNestedPass<FuncOp>(createCSEPass());
@@ -311,7 +322,7 @@
passManager.addPass(createLLVMCPUCheckIRBeforeLLVMConversionPass());
}
// Handled tensor-type constants.
- passManager.addPass(createTensorConstantBufferizePass());
+ passManager.addPass(arith::createConstantBufferizePass());
passManager.addPass(createFoldTensorExtractOpPass());
// math dialect elementry functions -> polynomial form.
diff --git a/iree/compiler/Codegen/LLVMCPU/VectorContractCustomKernels.cpp b/iree/compiler/Codegen/LLVMCPU/VectorContractCustomKernels.cpp
index 0881503..62613a5 100644
--- a/iree/compiler/Codegen/LLVMCPU/VectorContractCustomKernels.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/VectorContractCustomKernels.cpp
@@ -6,16 +6,21 @@
#include "iree/compiler/Codegen/PassDetail.h"
#include "iree/compiler/Codegen/Passes.h"
+#include "iree/compiler/Utils/CustomKernelsTargetInfo.h"
+#include "iree/compiler/Utils/StringUtils.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/MathExtras.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
#include "mlir/Dialect/ArmNeon/ArmNeonDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -23,53 +28,6 @@
namespace mlir {
namespace iree_compiler {
-LogicalResult InferCustomKernelsTargetInfoFromParent(
- FuncOp entryPointFn, CustomKernelsTargetInfo &target_info) {
- // Set the out-value to defaults early so that early returns produce
- // consistent results and so that we can write simpler code below
- // (for loop OR-ing booleans, assuming initial 'false' value).
- target_info = CustomKernelsTargetInfo();
-
- // Try to find the parent ExecutableVariantOp and its relevant attributes.
- auto variantOp =
- entryPointFn->getParentOfType<IREE::HAL::ExecutableVariantOp>();
- if (!variantOp) {
- return failure();
- }
- IREE::HAL::ExecutableTargetAttr targetAttr = variantOp.target();
- if (!targetAttr) {
- return failure();
- }
- auto config = targetAttr.getConfiguration();
- if (!config) {
- return failure();
- }
- auto tripleAttr = config.getAs<StringAttr>("target_triple");
- if (!tripleAttr) {
- return failure();
- }
- auto cpuFeaturesAttr = config.getAs<StringAttr>("cpu_features");
- if (!cpuFeaturesAttr) {
- return failure();
- }
-
- // Set the out-value target_info fields.
- llvm::Triple triple(tripleAttr.getValue());
- llvm::SmallVector<llvm::StringRef> cpuFeatures;
- cpuFeaturesAttr.getValue().split(cpuFeatures, ',');
- switch (triple.getArch()) {
- case llvm::Triple::ArchType::aarch64:
- target_info.aarch64 = true;
- for (auto f : cpuFeatures) {
- target_info.dotprod |= (f == "+dotprod");
- }
- break;
- default:
- break;
- }
- return success();
-}
-
namespace {
// Returns true if `contractionOp` is of the form
@@ -154,7 +112,8 @@
// Note that this only looks at the immediately defining operation, so we likely
// want to have earlier passes that sink widening operations as far down as
// possible, which is probably just good regardless.
-static Value getExtSIInput(Type extSrcType, Type extDstType, Value extResult) {
+static Value getUnpromotedInput(Type extSrcType, Type extDstType,
+ Value extResult) {
auto extSIOp = extResult.getDefiningOp<arith::ExtSIOp>();
if (!extSIOp) {
return nullptr;
@@ -185,8 +144,331 @@
return rewriter.create<vector::ShapeCastOp>(loc, dstType, vector);
}
-/// Converts matrix-times-matrix-transposed vector.contracts with
-/// lhs and rhs inputs defined by arith.extsi promoting from i8 to i32,
+// Asserts that i is a power of two, and returns its log2.
+// Note: the llvm helpers used internally operate on uint32, but we keep that
+// an internal detail as the surrounding code here is all operating on signed
+// integers and mixing signed and unsigned would be error-prone.
+int8_t exactLog2(int32_t i) {
+ assert(i > 0);
+ uint32_t u = i;
+ assert(llvm::isPowerOf2_32(u));
+ return llvm::countTrailingZeros(u);
+}
+
+// Helper for power-of-two size computations without the overhead of runtime
+// divisions. Division remains very expensive compared to most other
+// instructions. It is of course cheap when the divisor is a compile-time
+// constant, but a typical use case for us is
+//
+// lhsBitWidth / kernel.registerBitWidth
+//
+// kernel.registerBitWidth is *initialized* from a literal value (say 128) but
+// it would be cumbersome to have to preserve its constant-expression status
+// throughout.
+class PowerOfTwo {
+ private:
+ int8_t exponent = 0;
+
+ public:
+ PowerOfTwo() {}
+ explicit PowerOfTwo(int32_t i) : exponent(exactLog2(i)) {}
+ int getExponent() const { return exponent; }
+ int val() const {
+ assert(exponent < 8 * sizeof(int) - 1);
+ return 1 << exponent;
+ }
+};
+
+// Returns i/p, asserting that p divides i exactly. Requires i >= 0.
+// Fast: implemented as a bit shift, not an actual division.
+int32_t fastExactDiv(int32_t i, PowerOfTwo p) {
+  assert(i >= 0 && "fastExactDiv requires a nonnegative dividend");
+  int32_t result = i >> p.getExponent();
+  assert(result << p.getExponent() == i && "division must be exact");
+  return result;
+}
+
+int32_t operator*(int32_t i, PowerOfTwo p) {
+ assert(i >= 0 && "only nonnegative values are supported");
+ uint32_t u = i;
+ assert(llvm::countLeadingZeros(u) > static_cast<unsigned>(p.getExponent()));
+ (void)u;
+ return i << p.getExponent();
+}
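+
+// Example (illustrative only): with PowerOfTwo(128), whose exponent is 7,
+//   fastExactDiv(256, PowerOfTwo(128)) == 2   // computed as 256 >> 7
+//   2 * PowerOfTwo(128) == 256                // computed as 2 << 7
+// so both helpers compile down to shifts rather than actual divisions.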
+
+// Describes a kernel. This struct is kept small to separate the kernels
+// themselves from the MLIR-specific generators consuming them
+// (see MMTKernelGenerator).
+struct MMTKernel {
+ enum class ScalarType : int8_t { None, I8, I32, F32 };
+ // Target architecture. Needed to generate inline asm constraints.
+ CustomKernelTargetArch arch = CustomKernelTargetArch::None;
+  // Bit width of the SIMD registers used by the kernel. Needed to determine
+  // how to slice vectors into register-sized vectors. Not in general fully
+  // determined by the arch, as it is typical for an arch to offer several
+  // collections of SIMD instructions with different widths.
+ PowerOfTwo registerBitWidth;
+ // Element type of the LHS vectors.
+ ScalarType lhsType = ScalarType::None;
+ // Element type of the RHS vectors.
+ ScalarType rhsType = ScalarType::None;
+ // Element type of the Accumulator and output vectors.
+ ScalarType accType = ScalarType::None;
+ // Number of rows of the LHS and Accumulator tile.
+ int8_t m0 = 0;
+ // Reduction dimension, i.e. number of columns of the LHS.
+ int8_t k0 = 0;
+ // Number of rows of the RHS (note that the operation being targeted, MMT,
+  // is matrix multiplication with a *transposed* RHS).
+ int8_t n0 = 0;
+ // If not null, points to the inline asm code template for this kernel.
+ // Register operands for the LHS, RHS and Accumulator are to be referenced as
+ // $(lhs:<i>), $(rhs:<i>), $(acc:<i>) respectively, where i is a decimal
+ // integer specifying the i-th register for each case (numbered independently,
+ // so each starts at 0).
+ const char *implAsm = nullptr;
+};
+
+// It's not the end of the world to grow this, but let's be mindful as we have
+// so far made the choice to pass MMTKernels by value.
+static_assert(sizeof(MMTKernel) == 8 + sizeof(void *), "");
+
+// i8*i8->i32 kernel for Aarch64 NEON +dotprod
+MMTKernel MMTKernel_8x4x8_i8i8i32_Aarch64Dotprod_InlineAsm() {
+ MMTKernel kernel;
+ kernel.arch = CustomKernelTargetArch::Aarch64;
+ kernel.m0 = 8;
+ kernel.k0 = 4;
+ kernel.n0 = 8;
+ kernel.lhsType = MMTKernel::ScalarType::I8;
+ kernel.rhsType = MMTKernel::ScalarType::I8;
+ kernel.accType = MMTKernel::ScalarType::I32;
+ kernel.registerBitWidth = PowerOfTwo(128);
+ kernel.implAsm = R"ASM(
+ sdot $(acc:0).4s, $(rhs:0).16b, $(lhs:0).4b[0]
+ sdot $(acc:1).4s, $(rhs:1).16b, $(lhs:0).4b[0]
+ sdot $(acc:2).4s, $(rhs:0).16b, $(lhs:0).4b[1]
+ sdot $(acc:3).4s, $(rhs:1).16b, $(lhs:0).4b[1]
+ sdot $(acc:4).4s, $(rhs:0).16b, $(lhs:0).4b[2]
+ sdot $(acc:5).4s, $(rhs:1).16b, $(lhs:0).4b[2]
+ sdot $(acc:6).4s, $(rhs:0).16b, $(lhs:0).4b[3]
+ sdot $(acc:7).4s, $(rhs:1).16b, $(lhs:0).4b[3]
+ sdot $(acc:8).4s, $(rhs:0).16b, $(lhs:1).4b[0]
+ sdot $(acc:9).4s, $(rhs:1).16b, $(lhs:1).4b[0]
+ sdot $(acc:10).4s, $(rhs:0).16b, $(lhs:1).4b[1]
+ sdot $(acc:11).4s, $(rhs:1).16b, $(lhs:1).4b[1]
+ sdot $(acc:12).4s, $(rhs:0).16b, $(lhs:1).4b[2]
+ sdot $(acc:13).4s, $(rhs:1).16b, $(lhs:1).4b[2]
+ sdot $(acc:14).4s, $(rhs:0).16b, $(lhs:1).4b[3]
+ sdot $(acc:15).4s, $(rhs:1).16b, $(lhs:1).4b[3]
+ )ASM";
+ return kernel;
+}
+
+// Returns the bit-width (= 8 * sizeof) of the given scalar type.
+PowerOfTwo bitWidth(MMTKernel::ScalarType t) {
+ switch (t) {
+ case MMTKernel::ScalarType::None:
+ break;
+ case MMTKernel::ScalarType::I8:
+ return PowerOfTwo(8);
+ case MMTKernel::ScalarType::I32:
+ return PowerOfTwo(32);
+ case MMTKernel::ScalarType::F32:
+ return PowerOfTwo(32);
+ }
+ assert(false);
+ return PowerOfTwo();
+}
+
+// Constructs the mlir::Type corresponding to a scalar type.
+Type mlirType(MLIRContext *context, MMTKernel::ScalarType t) {
+ switch (t) {
+ case MMTKernel::ScalarType::None:
+ break;
+ case MMTKernel::ScalarType::I8:
+ return IntegerType::get(context, 8, IntegerType::Signless);
+ case MMTKernel::ScalarType::I32:
+ return IntegerType::get(context, 32, IntegerType::Signless);
+ case MMTKernel::ScalarType::F32:
+ return FloatType::getF32(context);
+ }
+ assert(false);
+ return Type();
+}
+
+// This class is a helper for patterns generating custom kernels based on
+// MMTKernel structs.
+class MMTKernelGenerator {
+ public:
+ MMTKernelGenerator(MLIRContext *context, MMTKernel kernel)
+ : context(context), kernel(kernel) {}
+ // Generates the kernel. Returns the output accumulator values.
+ SmallVector<Value> generate(PatternRewriter &rewriter, Location loc,
+ ArrayRef<Value> lhs, ArrayRef<Value> rhs,
+ ArrayRef<Value> acc) {
+ validateOperands(lhs, rhs, acc);
+ if (kernel.implAsm) {
+ return generateAsm(rewriter, loc, lhs, rhs, acc);
+ }
+ // In the future we may have alternate generator paths, e.g. 1D intrinsics
+    // or other asm paths with a different interface, e.g. also handling the
+    // memory load accesses.
+ assert(false && "no implementation provided for kernel");
+ return {};
+ }
+ // Returns the number of SIMD registers needed for the LHS
+ int getLhsRegsCount() const {
+ int lhsBitWidth = kernel.m0 * kernel.k0 * bitWidth(kernel.lhsType);
+ return fastExactDiv(lhsBitWidth, kernel.registerBitWidth);
+ }
+ // Returns the number of SIMD registers needed for the RHS
+ int getRhsRegsCount() const {
+ int rhsBitWidth = kernel.n0 * kernel.k0 * bitWidth(kernel.rhsType);
+ return fastExactDiv(rhsBitWidth, kernel.registerBitWidth);
+ }
+ // Returns the number of SIMD registers needed for the Accumulator
+ int getAccRegsCount() const {
+ int accBitWidth = kernel.m0 * kernel.n0 * bitWidth(kernel.accType);
+ return fastExactDiv(accBitWidth, kernel.registerBitWidth);
+ }
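+  // For the aarch64 dotprod kernel above, these counts work out to:
+  //   LHS: 8*4*8  = 256 bits  -> 2 registers
+  //   RHS: 8*4*8  = 256 bits  -> 2 registers
+  //   Acc: 8*8*32 = 2048 bits -> 16 registers
+  // matching the $(lhs:0..1), $(rhs:0..1) and $(acc:0..15) references in its
+  // inline asm.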
+ // Returns the MLIR element type (not vector type) of the LHS
+ Type getLhsType() const { return mlirType(context, kernel.lhsType); }
+ // Returns the MLIR element type (not vector type) of the RHS
+ Type getRhsType() const { return mlirType(context, kernel.rhsType); }
+ // Returns the MLIR element type (not vector type) of the Accumulator
+ Type getAccType() const { return mlirType(context, kernel.accType); }
+ // Returns the VectorType of LHS SIMD register vectors
+ VectorType getLhsRegVectorType() const {
+ return VectorType::get(
+ {fastExactDiv(kernel.registerBitWidth.val(), bitWidth(kernel.lhsType))},
+ getLhsType());
+ }
+ // Returns the VectorType of RHS SIMD register vectors
+ VectorType getRhsRegVectorType() const {
+ return VectorType::get(
+ {fastExactDiv(kernel.registerBitWidth.val(), bitWidth(kernel.rhsType))},
+ getRhsType());
+ }
+ // Returns the VectorType of Accumulator SIMD register vectors
+ VectorType getAccRegVectorType() const {
+ return VectorType::get(
+ {fastExactDiv(kernel.registerBitWidth.val(), bitWidth(kernel.accType))},
+ getAccType());
+ }
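+  // For the aarch64 dotprod kernel above (128-bit registers), these types
+  // work out to vector<16xi8> for the LHS/RHS and vector<4xi32> for the
+  // accumulator, matching the .16b and .4s operand suffixes in its asm.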
+
+ private:
+ MLIRContext *context;
+ MMTKernel kernel;
+
+ // Helper for generate(). Asserts sanity of the vector-of-register-vectors.
+ void validateOperands(ArrayRef<Value> lhs, ArrayRef<Value> rhs,
+ ArrayRef<Value> acc) {
+ auto validate = [](ArrayRef<Value> vals, int expectedSize,
+ VectorType expectedElemType) {
+ assert(vals.size() == expectedSize);
+ for (const auto &val : vals) {
+ assert(val.getType().dyn_cast<VectorType>() == expectedElemType);
+ (void)val;
+ }
+ (void)expectedSize;
+ (void)expectedElemType;
+ };
+ validate(lhs, getLhsRegsCount(), getLhsRegVectorType());
+ validate(rhs, getRhsRegsCount(), getRhsRegVectorType());
+ validate(acc, getAccRegsCount(), getAccRegVectorType());
+ }
+  // Helper for generateAsmCodeAndConstraints.
+ std::string getInlineAsmConstraintForSimdRegister() const {
+ switch (kernel.arch) {
+ case CustomKernelTargetArch::Aarch64:
+ return "w";
+ case CustomKernelTargetArch::None:
+ break;
+ }
+ assert(false && "Unhandled CustomKernelTargetFeature value");
+ return {};
+ }
+ // Helper for generateAsm. Performs some pre-processing of the kernel's
+  // implAsm. Refer to the comment on MMTKernel::implAsm.
+ void generateAsmCodeAndConstraints(std::string &code,
+ std::string &constraints) {
+ assert(code.empty());
+ assert(constraints.empty());
+ // The LLVM inline asm syntax is documented here:
+ // https://llvm.org/docs/LangRef.html#inline-assembler-expressions
+ std::vector<std::string> outputConstraints;
+ std::vector<std::string> inputConstraints;
+ std::vector<std::string> tiedInputConstraints;
+ code = kernel.implAsm;
+ int numberedOperand = 0;
+ enum class OperandKind { Input, InputOutput };
+ std::string simdRegConstraint = getInlineAsmConstraintForSimdRegister();
+ auto processOperands = [&](OperandKind kind, int count, const char *name) {
+ for (int i = 0; i < count; ++i) {
+ std::string numberedOperandStr = llvm::itostr(numberedOperand++);
+ std::string match = llvm::formatv("$({0}:{1})", name, i);
+ std::string substitute = std::string("$") + numberedOperandStr;
+ replaceAllSubstrsInPlace(code, match, substitute);
+ if (kind == OperandKind::InputOutput) {
+ outputConstraints.push_back(std::string("=") + simdRegConstraint);
+ tiedInputConstraints.push_back(numberedOperandStr);
+ } else {
+ inputConstraints.push_back(simdRegConstraint);
+ }
+ }
+ };
+ processOperands(OperandKind::InputOutput, getAccRegsCount(), "acc");
+ processOperands(OperandKind::Input, getLhsRegsCount(), "lhs");
+ processOperands(OperandKind::Input, getRhsRegsCount(), "rhs");
+ constraints = llvm::join(outputConstraints, ",") + "," +
+ llvm::join(inputConstraints, ",") + "," +
+ llvm::join(tiedInputConstraints, ",");
+ }
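+  // For the aarch64 dotprod kernel above, generateAsmCodeAndConstraints
+  // rewrites $(acc:0)..$(acc:15) into $0..$15, $(lhs:0)..$(lhs:1) into
+  // $16..$17, and $(rhs:0)..$(rhs:1) into $18..$19, producing the constraint
+  // string
+  //   "=w,...,=w,w,w,w,w,0,1,...,15"
+  // i.e. 16 "=w" outputs, 4 "w" inputs, and 16 tied input-output operands.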
+ // Helper for generate(). Implements the asm path.
+ SmallVector<Value> generateAsm(PatternRewriter &rewriter, Location loc,
+ ArrayRef<Value> lhs, ArrayRef<Value> rhs,
+ ArrayRef<Value> acc) {
+ SmallVector<Value> inputs;
+    // First the input operands. This matches the constraint string, where the
+    // inputConstraints come before the tiedInputConstraints, the latter
+    // corresponding to the input-output operands.
+ inputs.append(lhs.begin(), lhs.end());
+ inputs.append(rhs.begin(), rhs.end());
+ // Then the input-output operands.
+ inputs.append(acc.begin(), acc.end());
+ // Create the inline asm op.
+ SmallVector<Type> outputOperandTypes(
+ llvm::map_range(acc, [](Value v) { return v.getType(); }));
+ auto returnType =
+ LLVM::LLVMStructType::getLiteral(context, outputOperandTypes);
+ auto dialectAttr =
+ LLVM::AsmDialectAttr::get(context, LLVM::AsmDialect::AD_ATT);
+ std::string code;
+ std::string constraints;
+ generateAsmCodeAndConstraints(code, constraints);
+ LLVM::InlineAsmOp asmOp = rewriter.create<LLVM::InlineAsmOp>(
+ loc, returnType, inputs, code, constraints,
+ /*has_side_effects=*/false, /*is_align_stack=*/false, dialectAttr,
+ /*operand_attrs=*/ArrayAttr());
+ // Extract result vectors from the asm op.
+ SmallVector<Value> resVec;
+    for (int i = 0; i < getAccRegsCount(); ++i) {
+ resVec.push_back(rewriter.create<LLVM::ExtractValueOp>(
+ loc, getAccRegVectorType(), asmOp.getRes(),
+ rewriter.getI64ArrayAttr({i})));
+ }
+ return resVec;
+ }
+};
+
+/// Converts matrix-times-matrix-transposed vector.contracts, together with
+/// any type-promotion ops (such as arith.extsi) on their input operands, to
+/// a custom kernel (at the moment an llvm.inline_asm op) described by the
+/// MMTKernel struct.
+///
+/// For example, in the case of an i8*i8->i32 kernel, the IR being replaced
+/// by the llvm.inline_asm op might look like:
///
/// %lhs_i32 = arith.extsi %lhs_i8 : i8 to i32
/// %rhs_i32 = arith.extsi %rhs_i8 : i8 to i32
@@ -196,125 +478,83 @@
/// %acc_i32 : vector<8x8xi32>,
/// [...]
///
-/// To vector ops reading directly from the %lhs_i8 and %rhs_i8 values
-/// (bypassing the existing arith.extsi) and passing that to a llvm.inline_asm
-/// block implementing the matrix multiplication arithmetic using Aarch64
-/// dot-product instructions (sdot).
-struct MMT_8x4x8_i8i8i32_Aarch64Dotprod_InlineAsm
- : OpRewritePattern<vector::ContractionOp> {
- using OpRewritePattern<vector::ContractionOp>::OpRewritePattern;
+class MMTCustomKernelPattern : public OpRewritePattern<vector::ContractionOp> {
+ private:
+ MMTKernel kernel;
+
+ public:
+ MMTCustomKernelPattern(MLIRContext *context, MMTKernel kernel)
+ : OpRewritePattern<vector::ContractionOp>(context), kernel(kernel) {}
LogicalResult matchAndRewrite(vector::ContractionOp contractionOp,
PatternRewriter &rewriter) const override {
- // Check if `contractionOp` matches, and obtain the un-promoted i8 input
- // LHS and RHS vectors, `lhsI8` and `rhsI8`.
- if (!isMatrixTimesMatrixTransposedOfGivenShape(contractionOp, 8, 4, 8)) {
+ // Check if `contractionOp` matches, and obtain the (un-promoted) input
+ // LHS and RHS vectors.
+ if (!isMatrixTimesMatrixTransposedOfGivenShape(contractionOp, kernel.m0,
+ kernel.k0, kernel.n0)) {
return failure();
}
- Type I8Type = rewriter.getIntegerType(8);
- Type I32Type = rewriter.getIntegerType(32);
+ MMTKernelGenerator generator(rewriter.getContext(), kernel);
+ Type lhsElemType = generator.getLhsType();
+ Type rhsElemType = generator.getRhsType();
+ Type accElemType = generator.getAccType();
VectorType accType = contractionOp.acc().getType().cast<VectorType>();
- if (accType.getElementType() != I32Type) {
+ if (accType.getElementType() != accElemType) {
return failure();
}
- Value lhsI8 = getExtSIInput(I8Type, I32Type, contractionOp.lhs());
- Value rhsI8 = getExtSIInput(I8Type, I32Type, contractionOp.rhs());
- if (!lhsI8 || !rhsI8) {
+ Value unpromotedLhs =
+ getUnpromotedInput(lhsElemType, accElemType, contractionOp.lhs());
+ Value unpromotedRhs =
+ getUnpromotedInput(rhsElemType, accElemType, contractionOp.rhs());
+ if (!unpromotedLhs || !unpromotedRhs) {
return failure();
}
-
- // `contractionOp` matches, start rewriting it. We only reference
- // the `lhsI8` and `rhsI8` values obtained above as the inputs of the
- // arith.extsi, so this rewrite will leave the existing arith.extsi without
- // any user (unless something else was using them), so they may be
- // removed by another transformation.
+ // `contractionOp` matches, start rewriting it.
Location loc = contractionOp.getLoc();
// Flatten the inputs to 1D vectors.
- Value flatLhsI8 = flatten(rewriter, loc, lhsI8);
- Value flatRhsI8 = flatten(rewriter, loc, rhsI8);
+ Value flatLhs = flatten(rewriter, loc, unpromotedLhs);
+ Value flatRhs = flatten(rewriter, loc, unpromotedRhs);
Value flatAcc = flatten(rewriter, loc, contractionOp.acc());
-
- // Create the 1D input vectors of 16 bytes each that are directly what
- // the target SIMD instructions will want.
- SmallVector<Value> lhsVec;
- SmallVector<Value> rhsVec;
- VectorType vector16xi8Type = VectorType::get({16}, I8Type);
- for (int position = 0; position < 8 * 4; position += 16) {
- lhsVec.push_back(
- extract1DSlice(rewriter, loc, vector16xi8Type, flatLhsI8, position));
- rhsVec.push_back(
- extract1DSlice(rewriter, loc, vector16xi8Type, flatRhsI8, position));
- }
- SmallVector<Value> accVec;
- VectorType int32x4Type = VectorType::get({4}, I32Type);
- for (int position = 0; position < 8 * 8; position += 4) {
- accVec.push_back(
- extract1DSlice(rewriter, loc, int32x4Type, flatAcc, position));
- }
-
- // Create the inline asm op's operands list.
- SmallVector<Value> asmOperands;
- // First the inputs operands.
- asmOperands.append(lhsVec);
- asmOperands.append(rhsVec);
- // Then the input-output operands.
- asmOperands.append(accVec);
- SmallVector<Type> asmOutputOperandTypes(
- llvm::map_range(accVec, [](Value v) { return v.getType(); }));
-
- // Create the inline asm op.
- auto returnType = LLVM::LLVMStructType::getLiteral(rewriter.getContext(),
- asmOutputOperandTypes);
- auto dialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(),
- LLVM::AsmDialect::AD_ATT);
- // The LLVM inline asm syntax is documented here:
- // https://llvm.org/docs/LangRef.html#inline-assembler-expressions
- LLVM::InlineAsmOp asmOp = rewriter.create<LLVM::InlineAsmOp>(
- loc, returnType, asmOperands,
- R"ASM(
- sdot $0.4s, $18.16b, $16.4b[0]
- sdot $1.4s, $19.16b, $16.4b[0]
- sdot $2.4s, $18.16b, $16.4b[1]
- sdot $3.4s, $19.16b, $16.4b[1]
- sdot $4.4s, $18.16b, $16.4b[2]
- sdot $5.4s, $19.16b, $16.4b[2]
- sdot $6.4s, $18.16b, $16.4b[3]
- sdot $7.4s, $19.16b, $16.4b[3]
- sdot $8.4s, $18.16b, $17.4b[0]
- sdot $9.4s, $19.16b, $17.4b[0]
- sdot $10.4s, $18.16b, $17.4b[1]
- sdot $11.4s, $19.16b, $17.4b[1]
- sdot $12.4s, $18.16b, $17.4b[2]
- sdot $13.4s, $19.16b, $17.4b[2]
- sdot $14.4s, $18.16b, $17.4b[3]
- sdot $15.4s, $19.16b, $17.4b[3]
- )ASM",
- "=w,=w,=w,=w,=w,=w,=w,=w,=w,=w,=w,=w,=w,=w,=w,=w,w,w,w,w,0,1,2,3,4,5,6,"
- "7,8,9,10,11,12,13,14,15",
- /*has_side_effects=*/false, /*is_align_stack=*/false, dialectAttr);
-
- // Extract result vectors from the asm op.
- SmallVector<Value, 16> resVec;
- for (int i = 0; i < 16; ++i) {
- resVec.push_back(rewriter.create<LLVM::ExtractValueOp>(
- loc, int32x4Type, asmOp.getRes(), rewriter.getI64ArrayAttr({i})));
- }
-
+ // Slice into SIMD-register-sized 1D input vectors ready to feed to the
+ // target SIMD instructions.
+ auto sliceIntoRegVectors = [&](int size, VectorType regVectorType,
+ Value src) {
+ SmallVector<Value> regVectors;
+ int regSize = regVectorType.getNumElements();
+ for (int position = 0; position < size; position += regSize) {
+ regVectors.push_back(
+ extract1DSlice(rewriter, loc, regVectorType, src, position));
+ }
+ return regVectors;
+ };
+ VectorType lhsRegVectorType = generator.getLhsRegVectorType();
+ VectorType rhsRegVectorType = generator.getRhsRegVectorType();
+ VectorType accRegVectorType = generator.getAccRegVectorType();
+ const SmallVector<Value> &lhsRegVectors =
+ sliceIntoRegVectors(kernel.m0 * kernel.k0, lhsRegVectorType, flatLhs);
+ const SmallVector<Value> &rhsRegVectors =
+ sliceIntoRegVectors(kernel.n0 * kernel.k0, rhsRegVectorType, flatRhs);
+ const SmallVector<Value> &accRegVectors =
+ sliceIntoRegVectors(kernel.m0 * kernel.n0, accRegVectorType, flatAcc);
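+    // For the aarch64 dotprod kernel this reproduces the previously
+    // hard-coded slicing: two vector<16xi8> slices each for the LHS and RHS,
+    // and sixteen vector<4xi32> slices for the accumulator.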
+ SmallVector<Value> resRegVectors = generator.generate(
+ rewriter, loc, lhsRegVectors, rhsRegVectors, accRegVectors);
    // Insert the result register vectors into the overall flat result
    // vector, still 1D.
- VectorType int32x64xType = VectorType::get({64}, I32Type);
+ VectorType flatAccVectorType = flatAcc.getType().cast<VectorType>();
Value result = rewriter.create<arith::ConstantOp>(
- loc, int32x64xType, DenseIntElementsAttr::get(int32x64xType, 0));
- for (int i = 0; i < 16; ++i) {
+ loc, flatAccVectorType,
+ DenseIntElementsAttr::get(flatAccVectorType, 0));
+ int accRegsCount = generator.getAccRegsCount();
+ int accRegNumElements = accRegVectorType.getNumElements();
+ for (int i = 0; i < accRegsCount; ++i) {
result = rewriter.create<vector::InsertStridedSliceOp>(
- loc, resVec[i], result, std::array<int64_t, 1>{4 * i},
+ loc, resRegVectors[i], result,
+ std::array<int64_t, 1>{accRegNumElements * i},
std::array<int64_t, 1>{1});
}
-
// Cast the result from 1D to 2D and replace the original vector.contract.
- VectorType int32x8x8xType = VectorType::get({8, 8}, I32Type);
- rewriter.replaceOpWithNewOp<vector::ShapeCastOp>(contractionOp,
- int32x8x8xType, result);
+ rewriter.replaceOpWithNewOp<vector::ShapeCastOp>(contractionOp, accType,
+ result);
return success();
}
};
@@ -356,8 +596,8 @@
return failure();
}
- Value inLhs = getExtSIInput(I8Type, I32Type, lhs);
- Value inRhs = getExtSIInput(I8Type, I32Type, rhs);
+ Value inLhs = getUnpromotedInput(I8Type, I32Type, lhs);
+ Value inRhs = getUnpromotedInput(I8Type, I32Type, rhs);
if (!inLhs || !inRhs) return failure();
@@ -439,7 +679,7 @@
public:
void getDependentDialects(DialectRegistry ®istry) const override {
registry.insert<vector::VectorDialect, LLVM::LLVMDialect>();
- if (target_info.intrinsics) {
+ if (target_info.has(CustomKernelTargetFeature::Intrinsics)) {
registry.insert<arm_neon::ArmNeonDialect>();
}
}
@@ -447,9 +687,12 @@
if (failed(Pass::initializeOptions(options))) {
return failure();
}
- target_info.aarch64 = aarch64;
- target_info.dotprod = dotprod;
- target_info.intrinsics = intrinsics;
+ if (failed(ParseCustomKernelsTargetInfo(arch, features, target_info))) {
+ return failure();
+ }
+ if (intrinsics) {
+ target_info.add(CustomKernelTargetFeature::Intrinsics);
+ }
return success();
}
void runOnOperation() override {
@@ -471,11 +714,12 @@
void populateVectorContractCustomKernelsPatterns(
const CustomKernelsTargetInfo &target_info, RewritePatternSet &patterns) {
MLIRContext *context = patterns.getContext();
- if (target_info.aarch64 && target_info.dotprod) {
- if (target_info.intrinsics) {
- patterns.insert<MMT_8x4x8_i8i8i32_Aarch64Dotprod_Intrinsics>(context);
+ if (target_info.has(CustomKernelTargetFeature::Aarch64Dotprod)) {
+ if (target_info.has(CustomKernelTargetFeature::Intrinsics)) {
+ patterns.add<MMT_8x4x8_i8i8i32_Aarch64Dotprod_Intrinsics>(context);
} else {
- patterns.insert<MMT_8x4x8_i8i8i32_Aarch64Dotprod_InlineAsm>(context);
+ patterns.add<MMTCustomKernelPattern>(
+ context, MMTKernel_8x4x8_i8i8i32_Aarch64Dotprod_InlineAsm());
}
}
}
diff --git a/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir b/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir
index 2275104..3b8f961 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir
+++ b/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir
@@ -20,7 +20,7 @@
%lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
%rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
%result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
- // expected-error @+1 {{expected 2 entries for workload_per_wg, but got 0}}
+ // expected-error @+1 {{expected two levels of tile sizes for CPUDoubleTilingExpert, got 0}}
linalg.matmul {lowering.config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
outs(%result: memref<4x16xf32>)
return
@@ -31,7 +31,7 @@
// -----
-#config = #iree_codegen.lowering.config<tile_sizes = [], native_vector_size = []>
+#config = #iree_codegen.lowering.config<tile_sizes = [[], []], native_vector_size = []>
#translation = #iree_codegen.translation.info<"CPUDoubleTilingExpert", workload_per_wg = [1, 0]>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [
#hal.descriptor_set.layout<0, bindings = [
@@ -62,7 +62,7 @@
// -----
-#config = #iree_codegen.lowering.config<tile_sizes = [], native_vector_size = []>
+#config = #iree_codegen.lowering.config<tile_sizes = [[], []], native_vector_size = []>
#translation = #iree_codegen.translation.info<"CPUDoubleTilingExpert", workload_per_wg = [1, 1, 1, 1]>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [
#hal.descriptor_set.layout<0, bindings = [
@@ -82,7 +82,7 @@
%lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
%rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
%result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
- // expected-error @+1 {{workload_per_wg size should be less than 3}}
+ // expected-error @+1 {{workload_per_wg size should be less than or equal to 3}}
linalg.matmul {lowering.config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
outs(%result: memref<4x16xf32>)
return
diff --git a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
index 6438c9e..92c6790 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
+++ b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
@@ -1448,3 +1448,187 @@
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[], [8, 8, 8], [1, 4, 4]{{\]}}, native_vector_size = [1, 4, 4]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUTileFuseAndVectorize", workload_per_wg = [64, 64]>
// CHECK: linalg.matmul {lowering.config = #[[CONFIG]]}
+
+// -----
+
+#executable_layout = #hal.executable.layout<push_constants = 4, sets = [
+ #hal.descriptor_set.layout<0, bindings = [
+ #hal.descriptor_set.binding<0, storage_buffer>,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<
+ "llvm", "embedded-elf-x86_64", {
+ data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+ native_vector_size = 16 : index,
+ target_triple = "x86_64-unknown-unknown-eabi-elf"
+ }
+>
+#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
+#map1 = affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>
+hal.executable private @gemm_unit_N {
+ hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
+ hal.executable.entry_point public @gemm_unit_N ordinal(0) layout(#executable_layout)
+ builtin.module {
+ func @gemm_unit_N() {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = hal.interface.constant.load[3] : i32
+ %4 = arith.index_cast %0 : i32 to index
+ %5 = arith.index_cast %1 : i32 to index
+ %6 = arith.index_cast %2 : i32 to index
+ %7 = arith.index_cast %3 : i32 to index
+ %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<readonly:?x?xf32>{%4, %5}
+ %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<readonly:?x1xf32>{%6}
+ %10 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<readwrite:?x1xf32>{%7}
+ %11 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [%6, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x1xf32>{%6} -> tensor<?x1xf32>
+ %workgroup_size_x = hal.interface.workgroup.size[0] : index
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %12 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x]
+ %13 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x]
+ scf.for %arg0 = %12 to %4 step %13 {
+ %14 = affine.min #map1(%arg0)[%4, %workgroup_size_x]
+ %15 = flow.dispatch.tensor.load %8, offsets = [%arg0, 0], sizes = [%14, %5], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32>{%4, %5} -> tensor<?x?xf32>
+ %16 = flow.dispatch.tensor.load %10, offsets = [%arg0, 0], sizes = [%14, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:?x1xf32>{%7} -> tensor<?x1xf32>
+ %17 = linalg.matmul ins(%15, %11 : tensor<?x?xf32>, tensor<?x1xf32>) outs(%16 : tensor<?x1xf32>) -> tensor<?x1xf32>
+ flow.dispatch.tensor.store %17, %10, offsets = [%arg0, 0], sizes = [%14, 1], strides = [1, 1] : tensor<?x1xf32> -> !flow.dispatch.tensor<readwrite:?x1xf32>{%7}
+ }
+ return
+ }
+ }
+ }
+}
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[], [8, 0, 16]], native_vector_size = []>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUDoubleTilingExpert", workload_per_wg = [64]>
+// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0] -> (s0 ceildiv 64)>
+// CHECK: hal.executable.entry_point public @gemm_unit_N
+// CHECK-SAME: translation.info = #[[TRANSLATION]]
+// CHECK: ^{{[a-z0-9]+}}
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[N0:.+]] = affine.apply #[[MAP]]()[%[[ARG0]]]
+// CHECK: hal.return %[[N0]], %[[C1]], %[[C1]]
+// CHECK: linalg.matmul
+// CHECK-SAME: lowering.config = #[[CONFIG]]
+
+// -----
+
+#executable_layout = #hal.executable.layout<push_constants = 4, sets = [
+ #hal.descriptor_set.layout<0, bindings = [
+ #hal.descriptor_set.binding<0, storage_buffer>,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<
+ "llvm", "embedded-elf-x86_64", {
+ data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+ native_vector_size = 16 : index,
+ target_triple = "x86_64-unknown-unknown-eabi-elf"
+ }
+>
+hal.executable private @gemm_unit_M_unit_N {
+ hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
+ hal.executable.entry_point public @gemm_unit_M_unit_N ordinal(0) layout(#executable_layout)
+ builtin.module {
+ func @gemm_unit_M_unit_N() {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<readonly:1x1xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<readonly:1x1xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<readwrite:1x1xf32>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x1xf32> -> tensor<1x1xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x1xf32> -> tensor<1x1xf32>
+ %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:1x1xf32> -> tensor<1x1xf32>
+ %6 = linalg.matmul ins(%3, %4 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) -> tensor<1x1xf32>
+ flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:1x1xf32>
+ return
+ }
+ }
+ }
+}
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[], [0, 0, 0]], native_vector_size = []>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUDoubleTilingExpert", workload_per_wg = []>
+// CHECK: hal.executable.entry_point public @gemm_unit_M_unit_N
+// CHECK-SAME: translation.info = #[[TRANSLATION]]
+// CHECK: ^{{[a-z0-9]+}}
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK: hal.return %[[C1]], %[[C1]], %[[C1]]
+// CHECK: linalg.matmul
+// CHECK-SAME: lowering.config = #[[CONFIG]]
+
+// -----
+
+#executable_layout = #hal.executable.layout<push_constants = 4, sets = [
+ #hal.descriptor_set.layout<0, bindings = [
+ #hal.descriptor_set.binding<0, storage_buffer>,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<
+ "llvm", "embedded-elf-x86_64", {
+ data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+ native_vector_size = 16 : index,
+ target_triple = "x86_64-unknown-unknown-eabi-elf"
+ }
+>
+#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
+#map1 = affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>
+hal.executable private @gemm_unit_M {
+ hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
+ hal.executable.entry_point public @gemm_unit_M ordinal(0) layout(#executable_layout)
+ builtin.module {
+ func @gemm_unit_M() {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = hal.interface.constant.load[3] : i32
+ %4 = arith.index_cast %0 : i32 to index
+ %5 = arith.index_cast %1 : i32 to index
+ %6 = arith.index_cast %2 : i32 to index
+ %7 = arith.index_cast %3 : i32 to index
+ %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<readonly:?x?xf32>{%4, %5}
+ %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<readonly:1x?xf32>{%6}
+ %10 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<readwrite:1x?xf32>{%7}
+ %11 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [1, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:1x?xf32>{%6} -> tensor<1x?xf32>
+ %workgroup_size_x = hal.interface.workgroup.size[0] : index
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %12 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x]
+ %13 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x]
+ scf.for %arg0 = %12 to %5 step %13 {
+ %14 = affine.min #map1(%arg0)[%5, %workgroup_size_x]
+ %15 = flow.dispatch.tensor.load %8, offsets = [0, %arg0], sizes = [%4, %14], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32>{%4, %5} -> tensor<?x?xf32>
+ %16 = flow.dispatch.tensor.load %10, offsets = [0, %arg0], sizes = [1, %14], strides = [1, 1] : !flow.dispatch.tensor<readwrite:1x?xf32>{%7} -> tensor<1x?xf32>
+ %17 = linalg.matmul ins(%11, %15 : tensor<1x?xf32>, tensor<?x?xf32>) outs(%16 : tensor<1x?xf32>) -> tensor<1x?xf32>
+ flow.dispatch.tensor.store %17, %10, offsets = [0, %arg0], sizes = [1, %14], strides = [1, 1] : tensor<1x?xf32> -> !flow.dispatch.tensor<readwrite:1x?xf32>{%7}
+ }
+ return
+ }
+ }
+ }
+}
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[], [0, 32, 16]], native_vector_size = []>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUDoubleTilingExpert", workload_per_wg = [64]>
+// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0] -> (s0 ceildiv 64)>
+// CHECK: hal.executable.entry_point public @gemm_unit_M
+// CHECK-SAME: translation.info = #[[TRANSLATION]]
+// CHECK: ^{{[a-z0-9]+}}
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[N0:.+]] = affine.apply #[[MAP]]()[%[[ARG0]]]
+// CHECK: hal.return %[[N0]], %[[C1]], %[[C1]]
+// CHECK: linalg.matmul
+// CHECK-SAME: lowering.config = #[[CONFIG]]
diff --git a/iree/compiler/Codegen/LLVMCPU/test/vector_contract_to_arm_asm.mlir b/iree/compiler/Codegen/LLVMCPU/test/vector_contract_to_arm_asm.mlir
index db29930..2689781 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/vector_contract_to_arm_asm.mlir
+++ b/iree/compiler/Codegen/LLVMCPU/test/vector_contract_to_arm_asm.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt -iree-llvmcpu-vector-contract-custom-kernels='aarch64 dotprod' %s | FileCheck %s
+// RUN: iree-opt -iree-llvmcpu-vector-contract-custom-kernels='arch=aarch64 features=+dotprod' %s | FileCheck %s
func @mmt_8x4x8_i8i8i32_aarch64_dotprod_inline_asm(
%lhs: vector<8x4xi8>,
diff --git a/iree/compiler/Codegen/LLVMCPU/test/vector_contract_to_arm_intrinsics.mlir b/iree/compiler/Codegen/LLVMCPU/test/vector_contract_to_arm_intrinsics.mlir
index 9193ca9..360bcd3 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/vector_contract_to_arm_intrinsics.mlir
+++ b/iree/compiler/Codegen/LLVMCPU/test/vector_contract_to_arm_intrinsics.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt -iree-llvmcpu-vector-contract-custom-kernels='aarch64 dotprod intrinsics' %s | FileCheck %s
+// RUN: iree-opt -iree-llvmcpu-vector-contract-custom-kernels='arch=aarch64 features=+dotprod intrinsics' %s | FileCheck %s
// CHECK-LABEL: @vector_i8i8i32matmul(
// CHECK-SAME: %[[LHS:[^:[:space:]]+]]
diff --git a/iree/compiler/Codegen/LLVMGPU/BUILD b/iree/compiler/Codegen/LLVMGPU/BUILD
index 2d5293c..3b632c3 100644
--- a/iree/compiler/Codegen/LLVMGPU/BUILD
+++ b/iree/compiler/Codegen/LLVMGPU/BUILD
@@ -75,5 +75,6 @@
"@llvm-project//mlir:VectorToGPU",
"@llvm-project//mlir:VectorToLLVM",
"@llvm-project//mlir:VectorToSCF",
+ "@llvm-project//mlir:VectorTransforms",
],
)
diff --git a/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt b/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
index dfdc0cd..b7cd29a 100644
--- a/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
+++ b/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
@@ -66,6 +66,7 @@
MLIRVectorToGPU
MLIRVectorToLLVM
MLIRVectorToSCF
+ MLIRVectorTransforms
iree::compiler::Codegen::Common
iree::compiler::Codegen::Dialect::IREECodegenDialect
iree::compiler::Codegen::PassHeaders
diff --git a/iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.cpp b/iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.cpp
index a8e77bd..49d2a35 100644
--- a/iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.cpp
+++ b/iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.cpp
@@ -18,7 +18,7 @@
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
namespace mlir {
diff --git a/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp b/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp
index fb38835..923e3d4 100644
--- a/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp
+++ b/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp
@@ -21,7 +21,7 @@
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"
diff --git a/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp b/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
index 8555f81..225f671 100644
--- a/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
+++ b/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
@@ -21,7 +21,7 @@
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"
diff --git a/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 4e0bdb8..514019a 100644
--- a/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -99,9 +99,41 @@
IREE::Codegen::DispatchLoweringPassPipeline pipeline) {
TileSizesListType tileSizes;
SmallVector<int64_t> ts;
+ SmallVector<unsigned> partitionedLoops =
+ cast<IREE::Flow::PartitionableLoopsInterface>(op.getOperation())
+ .getPartitionableLoops(kNumMaxParallelDims);
+ unsigned index = 0;
  // Tile the outer partitioned parallel dimensions with a size of 1 and the
  // two innermost dimensions with the tileX/tileY sizes.
- ts.append(op.getNumParallelLoops() - 2, 1);
+ for (auto loopNum :
+ llvm::seq<unsigned>(0, op.getNumParallelLoops() - 2)) {
+ int64_t tileSize = 0;
+ if (index < partitionedLoops.size() &&
+ partitionedLoops[index] == loopNum) {
+ tileSize = 1;
+ index++;
+ }
+ ts.push_back(tileSize);
+ }
+
+ // Check for M loop being partitioned.
+ if (index < partitionedLoops.size() &&
+ partitionedLoops[index] == op.getNumParallelLoops() - 2) {
+ index++;
+ } else {
+    // M dim isn't partitioned.
+ tileX = 0;
+ }
+
+ // Check for N loop being partitioned.
+ if (index < partitionedLoops.size() &&
+ partitionedLoops[index] == op.getNumParallelLoops() - 1) {
+ index++;
+ } else {
+    // N dim isn't partitioned.
+ tileY = 0;
+ }
+
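+  // Illustrative example (hypothetical shapes): for an op with 4 parallel
+  // loops of which only loops {1, 2, 3} are partitionable, the loop above
+  // yields ts = [0, 1]; both checks then keep tileX and tileY, so the
+  // parallel tile sizes become [0, 1, tileX, tileY] before the reduction
+  // tile sizes are appended below.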
ts.append({tileX, tileY});
// Tile all the reduction dimensions.
ts.append(op.getNumReductionLoops(), tileK);
@@ -196,7 +228,9 @@
static LogicalResult setFftConfig(FuncOp entryPoint,
IREE::LinalgExt::FftOp op) {
- auto partitionedLoops = getPartitionedLoops(op);
+ auto interfaceOp = cast<IREE::Flow::PartitionableLoopsInterface>(*op);
+ auto partitionedLoops =
+ interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
unsigned loopDepth = partitionedLoops.back() + 1;
SmallVector<int64_t> workgroupTileSize(loopDepth, 0);
SmallVector<int64_t, 3> workgroupSize = {cudaWarpSize, 1, 1};
@@ -224,7 +258,9 @@
static LogicalResult setSortConfig(FuncOp entryPoint, Operation *op) {
TileSizesListType tileSizes;
- SmallVector<unsigned> partitionedLoops = getPartitionedLoops(op);
+ auto interfaceOp = cast<IREE::Flow::PartitionableLoopsInterface>(*op);
+ auto partitionedLoops =
+ interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
if (partitionedLoops.empty()) {
tileSizes.push_back({});
return setOpConfigAndEntryPointFnTranslation(
@@ -264,7 +300,9 @@
IREE::Codegen::DispatchLoweringPassPipeline passPipeline =
IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUDistribute;
TileSizesListType tileSizes;
- SmallVector<unsigned> partitionedLoops = getPartitionedLoops(op);
+ auto interfaceOp = cast<IREE::Flow::PartitionableLoopsInterface>(*op);
+ auto partitionedLoops =
+ interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
if (partitionedLoops.empty()) {
tileSizes.push_back({});
return setOpConfigAndEntryPointFnTranslation(
@@ -413,10 +451,10 @@
}
Operation *rootOperation = nullptr;
- // Find the root operation. linalg.generic, linalg.fill and linalg.copy are
- // not root operations if there are other compute operations present.
+ // Find the root operation. linalg.generic and linalg.fill are not root
+ // operations if there are other compute operations present.
for (Operation *op : llvm::reverse(computeOps)) {
- if (!isa<linalg::GenericOp, linalg::FillOp, linalg::CopyOp>(op)) {
+ if (!isa<linalg::GenericOp, linalg::FillOp>(op)) {
rootOperation = op;
break;
}
@@ -431,7 +469,7 @@
if (!rootOperation) {
for (Operation *op : llvm::reverse(computeOps)) {
- if (isa<linalg::GenericOp, linalg::FillOp, linalg::CopyOp>(op)) {
+ if (isa<linalg::GenericOp, linalg::FillOp>(op)) {
rootOperation = op;
break;
}
diff --git a/iree/compiler/Codegen/LLVMGPU/LLVMGPUDistributeSharedMemoryCopy.cpp b/iree/compiler/Codegen/LLVMGPU/LLVMGPUDistributeSharedMemoryCopy.cpp
index 04f6db5..88b8457 100644
--- a/iree/compiler/Codegen/LLVMGPU/LLVMGPUDistributeSharedMemoryCopy.cpp
+++ b/iree/compiler/Codegen/LLVMGPU/LLVMGPUDistributeSharedMemoryCopy.cpp
@@ -14,7 +14,7 @@
#include "iree/compiler/Codegen/Transforms/Transforms.h"
#include "iree/compiler/Codegen/Utils/MarkerUtils.h"
#include "mlir/Dialect/GPU/Passes.h"
-#include "mlir/Dialect/Vector/VectorTransforms.h"
+#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/Support/MathExtras.h"
@@ -41,8 +41,11 @@
  // We tile to 4 as we want each thread to load 4 elements in a cyclic
  // distribution.
SmallVector<Value, 4> tileSizesVal;
- unsigned rank =
- cast<linalg::CopyOp>(operation).getOutputBufferTypes()[0].getRank();
+ unsigned rank = cast<linalg::GenericOp>(operation)
+ .getOperand(0)
+ .getType()
+ .cast<MemRefType>()
+ .getRank();
for (unsigned i = 0; i < rank - 1; i++) {
int64_t t = (rank - i) <= kNumGPUDims ? 1 : 0;
tileSizesVal.push_back(
@@ -70,7 +73,8 @@
.setTileSizeComputationFunction(wgCopyTileSizeFn)
.setDistributionOptions(copyInvocationDistributionOptions);
patterns.insert<linalg::LinalgTilingPattern>(
- linalg::CopyOp::getOperationName(), patterns.getContext(), tilingOptions,
+ linalg::GenericOp::getOperationName(), patterns.getContext(),
+ tilingOptions,
linalg::LinalgTransformationFilter(
{StringAttr::get(patterns.getContext(),
getCopyToWorkgroupMemoryMarker())},
@@ -78,7 +82,7 @@
}
static void populateVectorizationPatterns(RewritePatternSet &patterns) {
- linalg::VectorizationPatterns<linalg::CopyOp>::insert(
+ linalg::VectorizationPatterns<linalg::GenericOp>::insert(
patterns, linalg::LinalgVectorizationOptions(),
linalg::LinalgTransformationFilter(StringAttr::get(
patterns.getContext(), getCopyToWorkgroupMemoryMarker())));
@@ -194,7 +198,7 @@
: public LLVMGPUDistributeSharedMemoryCopyBase<
LLVMGPUDistributeSharedMemoryCopyPass> {
void getDependentDialects(DialectRegistry ®istry) const override {
- registry.insert<vector::VectorDialect>();
+ registry.insert<vector::VectorDialect, scf::SCFDialect>();
}
void runOnOperation() override {
FuncOp funcOp = getOperation();
@@ -203,8 +207,8 @@
auto workgroupSize = getWorkgroupSize(entryPointOp);
workgroupSize.resize(3, 1);
MLIRContext *context = &getContext();
- SmallVector<linalg::CopyOp> copiesToWorkgroupMem;
- funcOp.walk([&](linalg::CopyOp copyOp) {
+ SmallVector<linalg::GenericOp> copiesToWorkgroupMem;
+ funcOp.walk([&](linalg::GenericOp copyOp) {
if (hasMarker(copyOp, getCopyToWorkgroupMemoryMarker()))
copiesToWorkgroupMem.push_back(copyOp);
});
@@ -212,8 +216,9 @@
int64_t flatWorkgroupSize =
workgroupSize[0] * workgroupSize[1] * workgroupSize[2];
bool isAligned = llvm::all_of(
- copiesToWorkgroupMem, [flatWorkgroupSize](linalg::CopyOp copyOp) {
- auto shape = copyOp.output().getType().cast<MemRefType>().getShape();
+ copiesToWorkgroupMem, [flatWorkgroupSize](linalg::GenericOp copyOp) {
+ auto shape =
+ copyOp.getOperand(0).getType().cast<MemRefType>().getShape();
// Verify that each dimension of the shape can be distributed on the
          // threads.
int64_t threadsAvailable = flatWorkgroupSize;
diff --git a/iree/compiler/Codegen/LLVMGPU/LLVMGPUPipelining.cpp b/iree/compiler/Codegen/LLVMGPU/LLVMGPUPipelining.cpp
index 88efad5..c5fce48 100644
--- a/iree/compiler/Codegen/LLVMGPU/LLVMGPUPipelining.cpp
+++ b/iree/compiler/Codegen/LLVMGPU/LLVMGPUPipelining.cpp
@@ -9,7 +9,7 @@
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/SCF/Transforms.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
//====---------------------------------------------------------------------===//
diff --git a/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp b/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp
index 2aec72d..2284cd6 100644
--- a/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp
+++ b/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp
@@ -9,8 +9,8 @@
#include "iree/compiler/Codegen/Passes.h"
#include "iree/compiler/Codegen/Utils/MarkerUtils.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
-#include "mlir/Dialect/Vector/VectorTransforms.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"
@@ -25,8 +25,8 @@
linalg::LinalgVectorizationOptions opt;
linalg::LinalgTransformationFilter f(
StringAttr::get(patterns.getContext(), getVectorizeMarker()));
- linalg::VectorizationPatterns<linalg::FillOp, linalg::CopyOp,
- linalg::GenericOp>::insert(patterns, opt, f);
+ linalg::VectorizationPatterns<linalg::FillOp, linalg::GenericOp>::insert(
+ patterns, opt, f);
patterns.add<linalg::LinalgVectorizationPattern>(
patterns.getContext(), f.addOpFilter<linalg::ContractionOpInterface>(),
opt);
diff --git a/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp
index d021f7b..f215ae9 100644
--- a/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp
+++ b/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp
@@ -35,7 +35,9 @@
static void populateTilingReductionPatterns(RewritePatternSet &patterns) {
auto tileSizesFn = [&](OpBuilder &builder,
Operation *op) -> SmallVector<Value, 4> {
- SmallVector<unsigned> partitionedLoops = getPartitionedLoops(op);
+ auto interfaceOp = cast<IREE::Flow::PartitionableLoopsInterface>(*op);
+ auto partitionedLoops =
+ interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
SmallVector<Value, 4> tileSizes = getTileSizes(builder, op, 0);
auto zero = builder.create<arith::ConstantIndexOp>(op->getLoc(), 0);
for (unsigned depth : partitionedLoops) {
@@ -77,7 +79,10 @@
}
std::reverse(tileSizes.begin(), tileSizes.end());
if (tileSizes.empty()) return SmallVector<Value, 4>();
- SmallVector<unsigned> partitionedLoops = getPartitionedLoops(operation);
+ auto interfaceOp =
+ cast<IREE::Flow::PartitionableLoopsInterface>(*operation);
+ auto partitionedLoops =
+ interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
unsigned maxDepth = partitionedLoops.back() + 1;
auto zero =
builder.create<arith::ConstantIndexOp>(operation->getLoc(), 0);
@@ -112,7 +117,7 @@
StringAttr::get(context, getWorkgroupMemoryMarker())},
StringAttr::get(context, getVectorizeMarker()));
filter.setMatchByDefault();
- linalg::TilingPatterns<linalg::MatmulOp, linalg::FillOp, linalg::CopyOp,
+ linalg::TilingPatterns<linalg::MatmulOp, linalg::FillOp,
linalg::BatchMatmulOp,
linalg::GenericOp>::insert(patterns, tilingOptions,
filter);
@@ -132,7 +137,13 @@
}
std::reverse(tileSizes.begin(), tileSizes.end());
if (tileSizes.empty()) return SmallVector<Value, 4>();
- SmallVector<unsigned> partitionedLoops = getPartitionedLoops(operation);
+ auto interfaceOp =
+ cast<IREE::Flow::PartitionableLoopsInterface>(*operation);
+ auto partitionedLoops =
+ interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
+ if (partitionedLoops.empty()) {
+ return tileSizesVal;
+ }
unsigned maxDepth = partitionedLoops.back() + 1;
auto zero =
builder.create<arith::ConstantIndexOp>(operation->getLoc(), 0);
@@ -174,7 +185,7 @@
return success(!isa<IREE::LinalgExt::FftOp>(op));
}).setMatchByDefault();
linalg::TilingPatterns<
- linalg::MatmulOp, linalg::FillOp, linalg::CopyOp, linalg::BatchMatmulOp,
+ linalg::MatmulOp, linalg::FillOp, linalg::BatchMatmulOp,
linalg::GenericOp, linalg::Conv2DNhwcHwcfOp,
linalg::DepthwiseConv2DNhwcHwcOp, linalg::DepthwiseConv2DNhwcHwcmOp,
linalg::PoolingNhwcMaxOp, linalg::PoolingNhwcMinOp,
@@ -184,7 +195,7 @@
}
static LogicalResult copyToWorkgroupMemory(OpBuilder &b, Value src, Value dst) {
- auto copyOp = b.create<linalg::CopyOp>(src.getLoc(), src, dst);
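+  // createLinalgCopyOp builds the copy as a linalg.generic with identity
+  // indexing maps (see distribute_wg_copy.mlir for the resulting IR), which
+  // is what the distribution and vectorization patterns now match on.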
+ Operation *copyOp = createLinalgCopyOp(b, src.getLoc(), src, dst);
setMarker(copyOp, getCopyToWorkgroupMemoryMarker());
return success();
}
@@ -192,17 +203,12 @@
static Optional<Value> allocateWorkgroupMemory(
OpBuilder &b, memref::SubViewOp subview,
ArrayRef<Value> boundingSubViewSize, DataLayout &layout) {
- // In CUDA workgroup memory is represented by a global variable. Create a
- // global variable and a memref.GetGlobalOp at the beginning of the function
- // to get the memref.
OpBuilder::InsertionGuard guard(b);
FuncOp funcOp = subview->getParentOfType<FuncOp>();
if (!funcOp) {
subview.emitError("expected op to be within std.func");
return llvm::None;
}
- ModuleOp moduleOp = funcOp->getParentOfType<ModuleOp>();
- SymbolTable symbolTable(moduleOp);
  // The bounding subview size is expected to be constant. This specifies the
// shape of the allocation.
@@ -216,18 +222,8 @@
MemRefType allocType =
MemRefType::get(shape, subview.getType().getElementType(), {},
gpu::GPUDialect::getWorkgroupAddressSpace());
- b.setInsertionPoint(&moduleOp.front());
- auto global = b.create<memref::GlobalOp>(
- funcOp.getLoc(), "__shared_memory__",
- /*sym_visibility=*/b.getStringAttr("private"),
- /*type=*/allocType,
- /*initial_value=*/ElementsAttr(),
- /*constant=*/false, /*alignment=*/IntegerAttr());
- symbolTable.insert(global);
-
- b.setInsertionPointToStart(&(*funcOp.getBody().begin()));
- Value buffer = b.create<memref::GetGlobalOp>(funcOp.getLoc(), global.type(),
- global.getName());
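+  // Allocate the buffer directly in the workgroup address space at the start
+  // of the function, replacing the previous module-level global approach.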
+ b.setInsertionPoint(&funcOp.front(), funcOp.front().begin());
+ Value buffer = b.create<memref::AllocOp>(funcOp.getLoc(), allocType);
return buffer;
}
@@ -316,7 +312,7 @@
  // Insert barriers before and after copies to workgroup memory, and skip
  // inserting barriers between back-to-back copies to workgroup memory.
OpBuilder builder(&getContext());
- funcOp.walk([&builder](linalg::CopyOp copyOp) {
+ funcOp.walk([&builder](linalg::GenericOp copyOp) {
if (hasMarker(copyOp, getCopyToWorkgroupMemoryMarker())) {
Operation *prevOp = copyOp->getPrevNode();
if (!prevOp || !hasMarker(prevOp, getCopyToWorkgroupMemoryMarker())) {
diff --git a/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorLowering.cpp b/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorLowering.cpp
index 55ef920..a6495d1 100644
--- a/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorLowering.cpp
+++ b/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorLowering.cpp
@@ -8,7 +8,7 @@
#include "iree/compiler/Codegen/Passes.h"
#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
-#include "mlir/Dialect/Vector/VectorTransforms.h"
+#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"
diff --git a/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorization.cpp b/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorization.cpp
index 9900f17..79df007 100644
--- a/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorization.cpp
+++ b/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorization.cpp
@@ -12,8 +12,8 @@
#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
-#include "mlir/Dialect/Vector/VectorTransforms.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"
@@ -31,8 +31,9 @@
linalg::LinalgVectorizationOptions opt;
linalg::LinalgTransformationFilter f(
StringAttr::get(ctx, getVectorizeMarker()));
- linalg::VectorizationPatterns<linalg::FillOp, linalg::CopyOp,
- linalg::GenericOp>::insert(patterns, opt, f);
+ linalg::VectorizationPatterns<linalg::FillOp, linalg::GenericOp>::insert(
+ patterns, opt, f);
+ patterns.add<linalg::CopyVectorizationPattern>(ctx);
patterns.add<linalg::LinalgVectorizationPattern>(
ctx, f.addOpFilter<linalg::ContractionOpInterface>(), opt);
vector::populateVectorTransferPermutationMapLoweringPatterns(patterns);
diff --git a/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index c54e673..327259b 100644
--- a/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -134,12 +134,13 @@
pm.addNestedPass<FuncOp>(IREE::LinalgExt::createLinalgExtToLoopsPass());
// Linalg -> SCF
+ pm.addNestedPass<FuncOp>(createMemrefCopyToLinalgPass());
pm.addNestedPass<FuncOp>(createConvertLinalgToLoopsPass());
pm.addNestedPass<FuncOp>(createCanonicalizerPass());
pm.addNestedPass<FuncOp>(createCSEPass());
  // Handle tensor-type constants.
- pm.addPass(createTensorConstantBufferizePass());
+ pm.addPass(arith::createConstantBufferizePass());
pm.addPass(createFoldTensorExtractOpPass());
pm.addNestedPass<FuncOp>(createLLVMGPUVectorLoweringPass());
diff --git a/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir b/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir
index ee6452c..09c15f7 100644
--- a/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir
+++ b/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir
@@ -62,20 +62,18 @@
// CHECK-LABEL: hal.executable private @dot_dispatch_0
// CHECK: hal.executable.variant public @cuda
-// CHECK: memref.global "private" @{{.*}} : memref<4x256xf32, 3>
-// CHECK: memref.global "private" @{{.*}} : memref<2x4xf32, 3>
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
// CHECK-DAG: %[[C256:.+]] = arith.constant 256 : index
// CHECK-DAG: %[[C1024:.+]] = arith.constant 1024 : index
-// CHECK-DAG: %[[BUFFER0:.+]] = memref.get_global @__shared_memory___0 : memref<4x256xf32, 3>
-// CHECK-DAG: %[[BUFFER1:.+]] = memref.get_global @__shared_memory__ : memref<2x4xf32, 3>
+// CHECK-DAG: %[[BUFFER0:.+]] = memref.alloc() : memref<4x256xf32, 3>
+// CHECK-DAG: %[[BUFFER1:.+]] = memref.alloc() : memref<2x4xf32, 3>
// CHECK: scf.for %[[K:.+]] = %[[C0]] to %[[C1024]] step %[[C4]] {
// CHECK: gpu.barrier
-// CHECK: linalg.copy(%{{.*}}, %{{.*}}) {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<2x4xf32, #{{.*}}>, memref<2x4xf32, 3>
+// CHECK: linalg.generic {{.*}} ins(%{{.*}} : memref<2x4xf32, #{{.*}}>) outs(%{{.*}} : memref<2x4xf32, 3>) attrs = {__internal_linalg_transform__ = "copy_to_workgroup_memory"}
// CHECK-NOT: gpu.barrier
-// CHECK: linalg.copy(%{{.*}}, %{{.*}}) {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<4x256xf32, #{{.*}}>, memref<4x256xf32, 3>
+// CHECK: linalg.generic {{.*}} ins(%{{.*}} : memref<4x256xf32, #{{.*}}>) outs(%{{.*}} : memref<4x256xf32, 3>) attrs = {__internal_linalg_transform__ = "copy_to_workgroup_memory"}
// CHECK: gpu.barrier
// CHECK: scf.for %[[IND0:.+]] = %{{.*}} to %[[C2]] step %[[C2]] {
// CHECK: scf.for %[[IND1:.+]] = %{{.*}} to %[[C256]] step %[[C256]] {
diff --git a/iree/compiler/Codegen/LLVMGPU/test/distribute_wg_copy.mlir b/iree/compiler/Codegen/LLVMGPU/test/distribute_wg_copy.mlir
index a71cdac..8bc4f5d 100644
--- a/iree/compiler/Codegen/LLVMGPU/test/distribute_wg_copy.mlir
+++ b/iree/compiler/Codegen/LLVMGPU/test/distribute_wg_copy.mlir
@@ -7,6 +7,8 @@
// CHECK-DAG: #[[$MAP4:.*]] = affine_map<()[s0, s1, s2] -> (s0 + s1 * 32 + s2 * 128 + 128)>
// CHECK-DAG: #[[$MAP5:.*]] = affine_map<()[s0, s1, s2] -> (s0 * 4 + s1 * 128 + s2 * 512)>
+#map0 = affine_map<()[s0, s1, s2] -> (s0 * 4 + s1 * 128 + s2 * 512)>
+
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [
#hal.descriptor_set.layout<0, bindings = [
#hal.descriptor_set.binding<0, storage_buffer>,
@@ -25,6 +27,9 @@
// CHECK-LABEL: @shared_mem_cpy(
builtin.func @shared_mem_cpy(
%m0 : memref<64x16xf32>, %m1 : memref<256x4xf32>, %m2 : memref<3x512xf32>) {
+ %c0 = arith.constant 0 : index
+
+ %0 = "affine.apply"(%c0) {map = affine_map<(d0) -> (d0)>} : (index) -> (index)
%sm0 = memref.get_global @__shared_memory__ : memref<64x16xf32, 3>
%sm1 = memref.get_global @__shared_memory___0 : memref<256x4xf32, 3>
%sm2 = memref.get_global @__shared_memory___1 : memref<3x512xf32, 3>
@@ -44,7 +49,14 @@
// CHECK: vector.transfer_write %[[R0]], %{{.*}}[%[[Y0]], %[[X0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x16xf32, 3>
// CHECK: vector.transfer_write %[[R1]], %{{.*}}[%[[Y1]], %[[X0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x16xf32, 3>
- linalg.copy(%m0, %sm0) {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<64x16xf32>, memref<64x16xf32, 3>
+ linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
+ iterator_types = ["parallel", "parallel"]}
+ ins(%m0 : memref<64x16xf32>)
+ outs(%sm0 : memref<64x16xf32, 3>)
+ attrs= {__internal_linalg_transform__ = "copy_to_workgroup_memory"} {
+ ^bb0(%arg4: f32, %s: f32): // no predecessors
+ linalg.yield %arg4 : f32
+ }
// CHECK: %[[Y1:.*]] = affine.apply #[[$MAP3]]()[%[[TX]], %[[TY]], %[[TZ]]]
// CHECK: %[[R2:.*]] = vector.transfer_read %{{.*}}[%[[Y1]], %[[C0]]], %{{.*}} {in_bounds = [true, true]} : memref<256x4xf32>, vector<1x4xf32>
@@ -53,7 +65,14 @@
// CHECK: vector.transfer_write %[[R2]], %{{.*}}[%[[Y1]], %[[C0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<256x4xf32, 3>
// CHECK: vector.transfer_write %[[R3]], %{{.*}}[%[[Y2]], %[[C0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<256x4xf32, 3>
- linalg.copy(%m1, %sm1) {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<256x4xf32>, memref<256x4xf32, 3>
+ linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
+ iterator_types = ["parallel", "parallel"]}
+ ins(%m1 : memref<256x4xf32>)
+ outs(%sm1 : memref<256x4xf32, 3>)
+ attrs= {__internal_linalg_transform__ = "copy_to_workgroup_memory"} {
+ ^bb0(%arg4: f32, %s: f32): // no predecessors
+ linalg.yield %arg4 : f32
+ }
// CHECK: %[[X1:.*]] = affine.apply #[[$MAP5]]()[%[[TX]], %[[TY]], %[[TZ]]]
// CHECK: %[[R4:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[X1]]], %{{.*}} {in_bounds = [true, true]} : memref<3x512xf32>, vector<1x4xf32>
@@ -63,7 +82,14 @@
// CHECK: vector.transfer_write %[[R5]], %{{.*}}[%c1, %15] {in_bounds = [true, true]} : vector<1x4xf32>, memref<3x512xf32, 3>
// CHECK: vector.transfer_write %[[R6]], %{{.*}}[%c2, %15] {in_bounds = [true, true]} : vector<1x4xf32>, memref<3x512xf32, 3>
- linalg.copy(%m2, %sm2) {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<3x512xf32>, memref<3x512xf32, 3>
+ linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
+ iterator_types = ["parallel", "parallel"]}
+ ins(%m2 : memref<3x512xf32>)
+ outs(%sm2 : memref<3x512xf32, 3>)
+ attrs= {__internal_linalg_transform__ = "copy_to_workgroup_memory"} {
+ ^bb0(%arg4: f32, %s: f32): // no predecessors
+ linalg.yield %arg4 : f32
+ }
gpu.barrier
return
}
diff --git a/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir b/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
index f80df9e..7eec79c 100644
--- a/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
+++ b/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
@@ -249,7 +249,12 @@
%9 = affine.apply affine_map<(d0) -> (d0 + 4)>(%arg0)
%10 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg1)
%11 = memref.subview %1[%9, %10] [%4, %7] [1, 1] : memref<?x?xi32> to memref<?x?xi32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
- linalg.copy(%8, %11) : memref<?x?xi32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, memref<?x?xi32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
+ linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]}
+ ins(%8 : memref<?x?xi32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>)
+ outs(%11 : memref<?x?xi32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>) {
+ ^bb0(%arg4: i32, %s: i32): // no predecessors
+ linalg.yield %arg4 : i32
+ }
}
}
return
@@ -258,9 +263,9 @@
}
}
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[1, 256]{{\]}}, native_vector_size = []>
-// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0] -> (s0 ceildiv 256)>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"LLVMGPUVectorize", workload_per_wg = [256, 1]>
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[1, 64]{{\]}}, native_vector_size = []>
+// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0] -> (s0 ceildiv 64)>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"LLVMGPUVectorize", workload_per_wg = [64, 1]>
// CHECK: hal.executable.entry_point public @tensor_insert_slice
// CHECK-SAME: translation.info = #[[TRANSLATION]]
// CHECK-NEXT: %[[ARG0:[a-zA-Z0-9_]+]]: index
@@ -268,7 +273,7 @@
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[NWGSX:.+]] = affine.apply #[[MAP0]]()[%[[ARG0]]]
// CHECK: hal.return %[[NWGSX]], %[[ARG1]], %[[C1]]
-// CHECK: linalg.copy
+// CHECK: linalg.generic
// CHECK-SAME: lowering.config = #[[CONFIG]]
// -----
diff --git a/iree/compiler/Codegen/Passes.h b/iree/compiler/Codegen/Passes.h
index 9da0bba..84d0701 100644
--- a/iree/compiler/Codegen/Passes.h
+++ b/iree/compiler/Codegen/Passes.h
@@ -11,6 +11,7 @@
#include "iree/compiler/Codegen/Dialect/LoweringConfig.h"
#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
+#include "iree/compiler/Utils/CustomKernelsTargetInfo.h"
#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Pass/Pass.h"
@@ -127,6 +128,9 @@
/// Pass to convert math operations to their polynomial approximation.
std::unique_ptr<OperationPass<>> createPolynomialApproximationPass();
+
+/// Creates a pass to convert memref.copy to linalg op.
+std::unique_ptr<OperationPass<FuncOp>> createMemrefCopyToLinalgPass();
//----------------------------------------------------------------------------//
// Common codegen patterns.
//----------------------------------------------------------------------------//
@@ -182,32 +186,6 @@
// LLVMCPU Codegen specific patterns.
//------------------------------------------------------------------------------
-// Some codegen patterns need to know target CPU information. They can receive
-// such information by means of this struct, which can be populated from either
-// pass options (e.g. in lit tests,
-// -iree-llvmcpu-vector-contract-custom-kernels='aarch64 dotprod')
-// or from global state (see InferCustomKernelsTargetInfoFromGlobals below).
-//
-// It would be interesting to find an opportunity to de-duplicate this with
-// other data structures containing similar information, but a difficulty here
-// is that in the case of lit tests, where we need to populate this from
-// a minimal set of custom boolean options passed to a pass such as
-// -iree-llvmcpu-vector-contract-custom-kernels, we do not have enough
-// information to populate all the other fields of existing, larger data
-// structures. That's the motivation for this custom, minimal struct.
-struct CustomKernelsTargetInfo {
- // Indicates that the target ISA is Aarch64
- bool aarch64 = false;
- // Under aarch64: indicates dot-product extension (SDOT, UDOT)
- bool dotprod = false;
- // Indicates that intrinsics should be used rather than inline asm
- bool intrinsics = false;
-};
-
-// Populate target_info fields from the parent HAL::ExecutableVariantOp.
-LogicalResult InferCustomKernelsTargetInfoFromParent(
- FuncOp entryPointFn, CustomKernelsTargetInfo &target_info);
-
/// Populates `patterns` to convert certain vector.contract ops to special
/// "kernels" written either in SIMD intrinsics or inline assembly.
void populateVectorContractCustomKernelsPatterns(
@@ -372,9 +350,6 @@
/// cooperative matrix ops when possible.
std::unique_ptr<OperationPass<FuncOp>> createSPIRVVectorToCooperativeOpsPass();
-/// Pass to lower linalg.copy for copying data to workgroup memory.
-std::unique_ptr<OperationPass<FuncOp>> createSPIRVCopyToWorkgroupMemoryPass();
-
/// Pass to tile Linalg ops with tensor semantics to invocations.
std::unique_ptr<OperationPass<FuncOp>> createSPIRVTilePass();
diff --git a/iree/compiler/Codegen/Passes.td b/iree/compiler/Codegen/Passes.td
index 7b90474..6821491 100644
--- a/iree/compiler/Codegen/Passes.td
+++ b/iree/compiler/Codegen/Passes.td
@@ -121,6 +121,13 @@
let constructor =
"mlir::iree_compiler::createPolynomialApproximationPass()";
}
+
+def MemrefCopyToLinalgPass :
+ Pass<"iree-codegen-memrefcopy-to-linalg", "FuncOp"> {
+ let summary = "Convert memref.copy to linalg op";
+ let constructor =
+ "mlir::iree_compiler::createMemrefCopyToLinalgPass()";
+}
//------------------------------------------------------------------------------
// LLVMCPU
//------------------------------------------------------------------------------
@@ -171,15 +178,15 @@
let summary = "Enable custom kernels (inline assembly or intrinsics) for some vector.contract ops";
let constructor = "mlir::iree_compiler::createVectorContractCustomKernelsPass()";
let options = [
- Option<"aarch64", "aarch64", "bool",
- /*default=*/"false",
- "Enable aarch64 kernels">,
- Option<"dotprod", "dotprod", "bool",
- /*default=*/"false",
- "Under aarch64, enable kernels that use dotprod instructions">,
+ Option<"arch", "arch", "std::string",
+ /*default=*/"",
+ "Target architecture, e.g. aarch64">,
+ Option<"features", "features", "std::string",
+ /*default=*/"",
+ "Additional CPU feature flags, e.g. +dotprod">,
Option<"intrinsics", "intrinsics", "bool",
/*default=*/"false",
- "Under aarch64, enable kernels that use dotprod instructions">,
+ "Use intrinsics over inline assembly where applicable">,
];
}
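
The option surface switches from per-ISA booleans to generic arch/features
strings, mirroring the CustomKernelsTargetInfo struct that the Passes.h hunk
above relocates to iree/compiler/Utils/CustomKernelsTargetInfo.h, and also
fixes the copy-pasted help text on the intrinsics option. A hedged sketch of
how the strings could map back onto the old struct fields (only the field
names come from the removed struct; the parsing is an assumption):

  CustomKernelsTargetInfo info;
  if (llvm::StringRef(arch) == "aarch64") {
    info.aarch64 = true;
    // Features follow LLVM's "+feature" convention per the new help text.
    info.dotprod = llvm::StringRef(features).contains("+dotprod");
  }
  info.intrinsics = intrinsics;

Under that mapping, the old lit-test flag form
-iree-llvmcpu-vector-contract-custom-kernels='aarch64 dotprod' would become
something like
-iree-llvmcpu-vector-contract-custom-kernels='arch=aarch64 features=+dotprod'.
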
@@ -314,13 +321,6 @@
let constructor = "mlir::iree_compiler::createSPIRVVectorizeLoadStore()";
}
-def SPIRVCopyToWorkgroupMemory :
- Pass<"iree-spirv-copy-to-workgroup-memory", "FuncOp"> {
- let summary = "Lower linalg.copy for copying data to workgroup memory";
- let constructor =
- "mlir::iree_compiler::createSPIRVCopyToWorkgroupMemoryPass()";
-}
-
//------------------------------------------------------------------------------
// Test passes
//------------------------------------------------------------------------------
diff --git a/iree/compiler/Codegen/SPIRV/BUILD b/iree/compiler/Codegen/SPIRV/BUILD
index dc081d7..57efa7b 100644
--- a/iree/compiler/Codegen/SPIRV/BUILD
+++ b/iree/compiler/Codegen/SPIRV/BUILD
@@ -19,7 +19,6 @@
"MaliConfig.cpp",
"NVIDIAConfig.cpp",
"Passes.cpp",
- "SPIRVCopyToWorkgroupMemory.cpp",
"SPIRVDistribute.cpp",
"SPIRVInitConfigPass.cpp",
"SPIRVLowerExecutableTargetPass.cpp",
@@ -87,5 +86,6 @@
"@llvm-project//mlir:VectorInterfaces",
"@llvm-project//mlir:VectorOps",
"@llvm-project//mlir:VectorToSPIRV",
+ "@llvm-project//mlir:VectorTransforms",
],
)
diff --git a/iree/compiler/Codegen/SPIRV/CMakeLists.txt b/iree/compiler/Codegen/SPIRV/CMakeLists.txt
index c8420cc..bee7ea5 100644
--- a/iree/compiler/Codegen/SPIRV/CMakeLists.txt
+++ b/iree/compiler/Codegen/SPIRV/CMakeLists.txt
@@ -24,7 +24,6 @@
"MaliConfig.cpp"
"NVIDIAConfig.cpp"
"Passes.cpp"
- "SPIRVCopyToWorkgroupMemory.cpp"
"SPIRVDistribute.cpp"
"SPIRVInitConfigPass.cpp"
"SPIRVLowerExecutableTargetPass.cpp"
@@ -76,6 +75,7 @@
MLIRVector
MLIRVectorInterfaces
MLIRVectorToSPIRV
+ MLIRVectorTransforms
iree::compiler::Codegen::Common
iree::compiler::Codegen::Dialect::IREECodegenDialect
iree::compiler::Codegen::PassHeaders
diff --git a/iree/compiler/Codegen/SPIRV/ConvertToSPIRVPass.cpp b/iree/compiler/Codegen/SPIRV/ConvertToSPIRVPass.cpp
index cfb57c9..07587ca 100644
--- a/iree/compiler/Codegen/SPIRV/ConvertToSPIRVPass.cpp
+++ b/iree/compiler/Codegen/SPIRV/ConvertToSPIRVPass.cpp
@@ -42,7 +42,7 @@
#include "mlir/Dialect/SPIRV/IR/SPIRVTypes.h"
#include "mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"
diff --git a/iree/compiler/Codegen/SPIRV/KernelConfig.cpp b/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
index ae3c77d..14fb49d 100644
--- a/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
+++ b/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
@@ -269,7 +269,10 @@
std::array<int64_t, 3> workgroupSize = {subgroupSize, 1, 1};
- auto partitionedLoops = getPartitionedLoops(op);
+ auto interfaceOp = cast<IREE::Flow::PartitionableLoopsInterface>(*op);
+ auto partitionedLoops =
+ interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
+
unsigned loopDepth = partitionedLoops.back() + 1;
SmallVector<int64_t> workgroupTileSize(loopDepth, 0);
@@ -301,7 +304,9 @@
Operation *op) {
LLVM_DEBUG(llvm::dbgs() << "Using default config for op: " << *op << "\n");
FuncOp funcOp = op->getParentOfType<FuncOp>();
- auto partitionedLoops = getPartitionedLoops(op);
+ auto interfaceOp = cast<IREE::Flow::PartitionableLoopsInterface>(*op);
+ auto partitionedLoops =
+ interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
// Special case for not tiled ops.
if (partitionedLoops.empty()) {
@@ -392,10 +397,6 @@
auto untiledResultShape = getUntiledResultShape(linalgOp, 0);
bool vectorizable =
!linalgOp.hasIndexSemantics() &&
- // TODO: Skip vectorization for linalg.copy ops. Right now handling of
- // it still goes through the old bufferization-first pipeline, while
- // vectorization pipeline expects tensor-semantic ops.
- !isa<linalg::CopyOp>(op) &&
// Skip vectorization for non-minor identity inputs as it generates
// vector.transfer_read ops with permutation maps that we currently
// cannot lower.
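
Both configuration helpers in this file now query the op itself through the
PartitionableLoopsInterface instead of the free getPartitionedLoops() helper,
with kNumMaxParallelDims capping how many outer parallel loops are considered
for workgroup distribution. Condensed from the two hunks above, the query
pattern is:

  auto interfaceOp = cast<IREE::Flow::PartitionableLoopsInterface>(*op);
  auto partitionedLoops =
      interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
  if (partitionedLoops.empty()) {
    // Untiled op: fall back to the single-workgroup special case.
  }
  // Otherwise the deepest partitioned loop bounds the tile-size vector:
  unsigned loopDepth = partitionedLoops.back() + 1;
  SmallVector<int64_t> workgroupTileSize(loopDepth, 0);
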
diff --git a/iree/compiler/Codegen/SPIRV/Passes.cpp b/iree/compiler/Codegen/SPIRV/Passes.cpp
index b7aa1bd..af0bb0a 100644
--- a/iree/compiler/Codegen/SPIRV/Passes.cpp
+++ b/iree/compiler/Codegen/SPIRV/Passes.cpp
@@ -72,6 +72,7 @@
/// tiling and vectorization and before buffer transformations.
static void addLoopMaterializationPasses(OpPassManager &pm) {
pm.addNestedPass<FuncOp>(IREE::LinalgExt::createLinalgExtToLoopsPass());
+ pm.addNestedPass<FuncOp>(createMemrefCopyToLinalgPass());
pm.addNestedPass<FuncOp>(createConvertLinalgToLoopsPass());
pm.addNestedPass<FuncOp>(createRemoveSingleIterationLoopPass());
}
@@ -193,8 +194,8 @@
//
// In the former path for CodeGen, we perform bufferization first, which will
// turn padding/copy (via flow.dispatch.tensor.load/store pairs) into
-// linalg.copy ops. Then we deduce CodeGen configuration from the linalg.copy op
-// and use a `lowering.config` attribute on it to drive transformations.
+// linalg.generic ops. Then we deduce CodeGen configuration from the
+// linalg.generic op and use a `lowering.config` attribute on it to drive transformations.
//
// In the latter path for CodeGen, we will see linalg.pad_tensor directly.
// However, properly tiling and distributing it is still ongoing work. So for now
diff --git a/iree/compiler/Codegen/SPIRV/SPIRVCopyToWorkgroupMemory.cpp b/iree/compiler/Codegen/SPIRV/SPIRVCopyToWorkgroupMemory.cpp
deleted file mode 100644
index 56b801f..0000000
--- a/iree/compiler/Codegen/SPIRV/SPIRVCopyToWorkgroupMemory.cpp
+++ /dev/null
@@ -1,325 +0,0 @@
-// Copyright 2020 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-//===---- SPIRVCopyToWorkgroupMemoryPass.cpp ------------------------------===//
-//
-// This pass lowers linalg.copy for copying data to the workgroup memory.
-//
-//===----------------------------------------------------------------------===//
-
-#include <memory>
-#include <numeric>
-
-#include "iree/compiler/Codegen/PassDetail.h"
-#include "iree/compiler/Codegen/Passes.h"
-#include "iree/compiler/Codegen/SPIRV/MemorySpace.h"
-#include "iree/compiler/Codegen/SPIRV/Utils.h"
-#include "iree/compiler/Codegen/Transforms/Transforms.h"
-#include "iree/compiler/Codegen/Utils/MarkerUtils.h"
-#include "iree/compiler/Codegen/Utils/Utils.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "mlir/Dialect/GPU/GPUDialect.h"
-#include "mlir/Dialect/Linalg/IR/Linalg.h"
-#include "mlir/Dialect/Linalg/Transforms/CodegenStrategy.h"
-#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/SCF/SCF.h"
-#include "mlir/Dialect/SCF/Transforms.h"
-#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h"
-#include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
-#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
-#include "mlir/IR/BuiltinOps.h"
-#include "mlir/IR/BuiltinTypes.h"
-#include "mlir/IR/OpDefinition.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Support/LLVM.h"
-#include "mlir/Support/LogicalResult.h"
-#include "mlir/Transforms/DialectConversion.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-
-namespace mlir {
-namespace iree_compiler {
-namespace {
-
-template <typename GPUIdOp, typename GPUCountOp>
-linalg::ProcInfo getLinearizedGPUProcessorIdAndCount(
- Location loc, ConversionPatternRewriter &rewriter) {
- SmallVector<linalg::ProcInfo, 3> procInfo =
- getGPUProcessorIdsAndCounts<GPUIdOp, GPUCountOp>(rewriter, loc,
- kNumGPUDims);
- linalg::ProcInfo linearized;
- linearized.procId = procInfo[0].procId;
- linearized.nprocs = procInfo[0].nprocs;
- for (unsigned i = 0; i < kNumGPUDims - 1; ++i) {
- linearized.procId = rewriter.create<arith::MulIOp>(loc, linearized.procId,
- procInfo[i + 1].nprocs);
- linearized.procId = rewriter.create<arith::AddIOp>(loc, linearized.procId,
- procInfo[i + 1].procId);
- linearized.nprocs = rewriter.create<arith::MulIOp>(loc, linearized.nprocs,
- procInfo[i + 1].nprocs);
- }
- return linearized;
-}
-
-/// Distributes scf.parallel to processors, with the processors logically
-/// arranged with the same dimensionality as the number of loops, i.e. a
-/// scf.parallel with 2 loops maps to a 2D grid of processors. `processorIDs`
-/// and `numProcessors` must be the same size as the number of loops and are
-/// the values to use for the process ID and processor count along each
-/// dimension in the distributed code.
-/// This method accounts for the case where the number of processors is not
-/// enough to execute the entire iteration space with one iteration mapped to
-/// each processor, so it implements a cyclic distribution of iterations to
-/// processors.
-LogicalResult distributeCyclicallyToProcessors(
- ConversionPatternRewriter &rewriter, scf::ParallelOp pLoopOp,
- ArrayRef<linalg::ProcInfo> procInfo) {
- unsigned numLoops = pLoopOp.getNumLoops();
- assert(numLoops == procInfo.size() &&
- "expected as many ids as number of loops");
- SmallVector<LoopBounds, 2> forBounds;
- SmallVector<unsigned, 2> permutation;
- forBounds.reserve(numLoops);
- permutation.reserve(numLoops);
- Location loc = pLoopOp.getLoc();
- auto lbs = pLoopOp.getLowerBound(), ubs = pLoopOp.getUpperBound(),
- steps = pLoopOp.getStep();
- for (unsigned i : llvm::seq<unsigned>(0, procInfo.size())) {
- Value mappedLb = rewriter.create<arith::AddIOp>(
- loc, lbs[i],
- rewriter.create<arith::MulIOp>(loc, steps[i], procInfo[i].procId));
- Value mappedStep =
- rewriter.create<arith::MulIOp>(loc, steps[i], procInfo[i].nprocs);
- forBounds.push_back({mappedLb, ubs[i], mappedStep});
- permutation.push_back(i);
- }
- replacePLoopOp(rewriter, pLoopOp, /*newPLoopBounds=*/{}, forBounds,
- permutation);
- return success();
-}
-
-/// Returns the number of elements copied when loading to/storing from
-/// workgroup memory. It is approximated as the size of the underlying
-/// allocation being copied into/from.
-Optional<int64_t> getLinearizedCopySize(linalg::CopyOp copyOp) {
- Value src = copyOp.input();
- Value dst = copyOp.output();
- MemRefType srcType = src.getType().cast<MemRefType>();
- MemRefType dstType = dst.getType().cast<MemRefType>();
-
- Value workgroupMemoryView;
- MemRefType workgroupMemoryType;
- if (srcType.getMemorySpaceAsInt() == getWorkgroupMemorySpace()) {
- workgroupMemoryView = src;
- workgroupMemoryType = srcType;
- } else if (dstType.getMemorySpaceAsInt() == getWorkgroupMemorySpace()) {
- workgroupMemoryView = dst;
- workgroupMemoryType = dstType;
- } else {
- return {};
- }
-
- memref::SubViewOp workgroupMemorySubviewOp =
- dyn_cast_or_null<memref::SubViewOp>(workgroupMemoryView.getDefiningOp());
- if (!workgroupMemorySubviewOp) return {};
- memref::AllocOp allocOp = dyn_cast_or_null<memref::AllocOp>(
- workgroupMemorySubviewOp.source().getDefiningOp());
- if (!allocOp) return {};
-
- MemRefType allocOpType = allocOp.getType();
- if (!allocOpType.hasStaticShape()) return {};
- return allocOpType.getNumElements();
-}
-
-LogicalResult distributeCopyOp(linalg::CopyOp copyOp, scf::ParallelOp pLoopOp,
- ConversionPatternRewriter &rewriter) {
- pLoopOp = collapseParallelLoops(rewriter, pLoopOp);
- if (!pLoopOp) return failure();
-
- Optional<int64_t> copyLength = getLinearizedCopySize(copyOp);
- linalg::ProcInfo idAndCount =
- getLinearizedGPUProcessorIdAndCount<gpu::ThreadIdOp, gpu::BlockDimOp>(
- copyOp.getLoc(), rewriter);
- auto workgroupSize =
- spirv::lookupLocalWorkGroupSize(copyOp).getValues<APInt>();
- int64_t linearizedWorkgroupSize = std::accumulate(
- workgroupSize.begin(), workgroupSize.end(), 1,
- [](int64_t total, APInt value) { return total * value.getSExtValue(); });
-
- if (copyLength.hasValue() && !workgroupSize.empty() &&
- copyLength.getValue() <= linearizedWorkgroupSize) {
- return distributeSingleIterationPerProcessor(rewriter, pLoopOp, idAndCount,
- /*generateGuard=*/true);
- }
- return distributeCyclicallyToProcessors(rewriter, pLoopOp, idAndCount);
-}
-
-// Applies tiling to an optimized load/store size, then distributes onto
-// invocations.
-LogicalResult tileAndDistributeCopy(linalg::CopyOp copyOp, ValueRange operands,
- ConversionPatternRewriter &rewriter) {
- linalg::LinalgTilingOptions options;
- // Tile to memory accesses of 128 bits, as those tend to be optimal on most GPUs.
- constexpr unsigned vecLoadBits = 128;
- unsigned elementBits =
- copyOp.getSource().getType().cast<MemRefType>().getElementTypeBitWidth();
- if (elementBits == 0 || vecLoadBits % elementBits != 0) return failure();
- unsigned numElement = vecLoadBits / elementBits;
- options.setTileSizes({1, numElement})
- .setLoopType(linalg::LinalgTilingLoopType::ParallelLoops);
- Optional<linalg::TiledLinalgOp> tiledOp =
- linalg::tileLinalgOp(rewriter, copyOp, options);
- if (!tiledOp) return failure();
- if (tiledOp->loops.empty()) return success();
- setMarker(tiledOp->op, getVectorizeMarker());
- auto pLoopOp = cast<scf::ParallelOp>(tiledOp->loops[0]);
- return distributeCopyOp(copyOp, pLoopOp, rewriter);
-}
-
-// Pattern to tile and distribute linalg::CopyOp.
-struct TileAndDistributeCopyOp : public OpConversionPattern<linalg::CopyOp> {
- using OpConversionPattern<linalg::CopyOp>::OpConversionPattern;
- LogicalResult matchAndRewrite(
- linalg::CopyOp linalgOp, OpAdaptor adaptor,
- ConversionPatternRewriter &rewriter) const override {
- if (!hasMarker(linalgOp, getCopyToWorkgroupMemoryMarker())) {
- return failure();
- }
- if (failed(
- tileAndDistributeCopy(linalgOp, adaptor.getOperands(), rewriter))) {
- return failure();
- }
-
- // Insert a barrier if read or write shared memory.
- if (llvm::any_of(linalgOp.getOperands(), [](Value output) {
- return output.getType().cast<MemRefType>().getMemorySpaceAsInt() ==
- getWorkgroupMemorySpace();
- })) {
- rewriter.create<spirv::ControlBarrierOp>(
- linalgOp.getLoc(), spirv::Scope::Workgroup, spirv::Scope::Workgroup,
- spirv::MemorySemantics::AcquireRelease);
- }
- rewriter.eraseOp(linalgOp);
- return success();
- }
-};
-
-/// CopyOp that are loading to/storing from workgroup memory are special cased
-/// to use all workitems to do a copy. This is done by linearizing the copy
-/// operation.
-// TODO(ravishankarm): This linearization is achieved through collapsing the
-// generated parallel loops from a multi-dimensional copy. Such lowering results
-// in mods/divs in the collapsed loop body. This can be removed by reshaping the
-// copy to be a 1D copy. This seems to be hitting an error in reshape
-// canonicalization. Investigate this further.
-struct SerializeAndDistributeCopy : public OpConversionPattern<linalg::CopyOp> {
- using OpConversionPattern::OpConversionPattern;
-
- LogicalResult matchAndRewrite(
- linalg::CopyOp copyOp, OpAdaptor adaptor,
- ConversionPatternRewriter &rewriter) const override {
- if (!hasMarker(copyOp, {getCopyToWorkgroupMemoryMarker()}))
- return failure();
-
- Optional<linalg::LinalgLoops> loops =
- linalg::linalgOpToParallelLoops(rewriter, copyOp);
- if (!loops) return failure();
- if (!loops.getValue().empty()) {
- auto pLoopOp = cast<scf::ParallelOp>(loops.getValue()[0]);
- if (failed(distributeCopyOp(copyOp, pLoopOp, rewriter))) return failure();
- }
-
- // If the `copyOp` writes to workgroup memory insert barrier after the
- // op.
- if (llvm::any_of(copyOp.getOperands(), [](Value output) {
- MemRefType outputType = output.getType().dyn_cast<MemRefType>();
- return outputType &&
- outputType.getMemorySpaceAsInt() == getWorkgroupMemorySpace();
- })) {
- rewriter.create<spirv::ControlBarrierOp>(
- copyOp.getLoc(), spirv::Scope::Workgroup, spirv::Scope::Workgroup,
- spirv::MemorySemantics::AcquireRelease);
- }
-
- rewriter.eraseOp(copyOp);
- return success();
- }
-};
-
-struct SPIRVCopyToWorkgroupMemoryPass
- : public SPIRVCopyToWorkgroupMemoryBase<SPIRVCopyToWorkgroupMemoryPass> {
- void getDependentDialects(DialectRegistry ®istry) const override {
- registry.insert<AffineDialect, gpu::GPUDialect, memref::MemRefDialect,
- scf::SCFDialect, vector::VectorDialect>();
- }
-
- void runOnOperation() override;
-
- private:
- void tileAndVectorizeLinalgCopy(FuncOp funcOp, MLIRContext *context);
- void lowerVectorOps(FuncOp funcOp, MLIRContext *context);
-};
-
-void SPIRVCopyToWorkgroupMemoryPass::tileAndVectorizeLinalgCopy(
- FuncOp funcOp, MLIRContext *context) {
- // 1. Tile linalg and distribute it on invocations.
- std::unique_ptr<ConversionTarget> target =
- std::make_unique<ConversionTarget>(*context);
- target->addDynamicallyLegalOp<linalg::CopyOp>([&](linalg::CopyOp copy) {
- return !(hasMarker(copy, getCopyToWorkgroupMemoryMarker()));
- });
- target->markUnknownOpDynamicallyLegal([](Operation *) { return true; });
-
- RewritePatternSet patterns(&getContext());
- // TODO(antiagainst): Re-enable vectorizing workgroup memory copy once the
- // whole pipeline is in a better state.
- // patterns.add<TileAndDistributeCopyOp>(context);
- patterns.add<SerializeAndDistributeCopy>(context);
- if (failed(applyPartialConversion(funcOp, *target, std::move(patterns)))) {
- return signalPassFailure();
- }
-
- // 2. Canonicalize the IR generated by tiling.
- RewritePatternSet canonicalizePatterns =
- linalg::getLinalgTilingCanonicalizationPatterns(context);
- populateAffineMinCanonicalizationPattern(canonicalizePatterns);
- scf::populateSCFForLoopCanonicalizationPatterns(canonicalizePatterns);
- if (failed(applyPatternsAndFoldGreedily(funcOp,
- std::move(canonicalizePatterns)))) {
- return signalPassFailure();
- }
-
- // 3. Vectorize the tiled linalg to be able to map it to load/store vector.
- RewritePatternSet vectorizationPatterns(&getContext());
- linalg::VectorizationPatterns<linalg::CopyOp>::insert(
- vectorizationPatterns, linalg::LinalgVectorizationOptions(),
- linalg::LinalgTransformationFilter(
- StringAttr::get(context, getVectorizeMarker()), {}));
- if (failed(applyPatternsAndFoldGreedily(funcOp,
- std::move(vectorizationPatterns)))) {
- return signalPassFailure();
- }
-}
-
-void SPIRVCopyToWorkgroupMemoryPass::runOnOperation() {
- MLIRContext *context = &getContext();
- FuncOp funcOp = getOperation();
- tileAndVectorizeLinalgCopy(funcOp, context);
-}
-} // namespace
-
-//===----------------------------------------------------------------------===//
-// Pass entry point and registration
-//===----------------------------------------------------------------------===//
-std::unique_ptr<OperationPass<FuncOp>> createSPIRVCopyToWorkgroupMemoryPass() {
- return std::make_unique<SPIRVCopyToWorkgroupMemoryPass>();
-}
-
-} // namespace iree_compiler
-} // namespace mlir
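
The pass deleted above carried the thread-id linearization that let a single
loop cyclically cover a multi-dimensional copy; the same arithmetic is still
visible in the CHECK lines of the test removed later in this diff. A scalar
model of getLinearizedGPUProcessorIdAndCount, assuming ids ordered {z, y, x}
as the emitted arith ops imply (standalone sketch, not compiler code):

  #include <cstdint>

  struct LinearProcInfo {
    int64_t procId;  // flattened invocation id
    int64_t nprocs;  // flattened invocation count
  };

  // Computes LID = (tidZ * dimY + tidY) * dimX + tidX and
  //          N   = dimZ * dimY * dimX, matching the removed helper.
  LinearProcInfo linearize(const int64_t tid[3], const int64_t dim[3]) {
    LinearProcInfo p{tid[0], dim[0]};
    for (int i = 0; i < 2; ++i) {
      p.procId = p.procId * dim[i + 1] + tid[i + 1];
      p.nprocs = p.nprocs * dim[i + 1];
    }
    return p;
  }
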
diff --git a/iree/compiler/Codegen/SPIRV/SPIRVTileAndDistribute.cpp b/iree/compiler/Codegen/SPIRV/SPIRVTileAndDistribute.cpp
index af35198..816817b 100644
--- a/iree/compiler/Codegen/SPIRV/SPIRVTileAndDistribute.cpp
+++ b/iree/compiler/Codegen/SPIRV/SPIRVTileAndDistribute.cpp
@@ -31,7 +31,7 @@
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Transforms.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/Dialect/Vector/VectorTransforms.h"
+#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/PatternMatch.h"
@@ -100,12 +100,13 @@
getLinalgMatchAndReplaceMarker(matchMarkers, getVectorizeMarker(),
context)
.setMatchByDefault();
- linalg::TilingPatterns<
- linalg::CopyOp, linalg::Conv1DNwcWcfOp, linalg::Conv3DNdhwcDhwcfOp,
- linalg::DepthwiseConv2DNhwcHwcmOp, linalg::FillOp, linalg::GenericOp,
- linalg::PoolingNhwcMaxOp, linalg::PoolingNhwcMinOp,
- linalg::PoolingNhwcSumOp>::insert(patterns, tilingOptions,
- filterVectorized);
+ linalg::TilingPatterns<linalg::Conv1DNwcWcfOp, linalg::Conv3DNdhwcDhwcfOp,
+ linalg::DepthwiseConv2DNhwcHwcmOp, linalg::FillOp,
+ linalg::GenericOp, linalg::PoolingNhwcMaxOp,
+ linalg::PoolingNhwcMinOp,
+ linalg::PoolingNhwcSumOp>::insert(patterns,
+ tilingOptions,
+ filterVectorized);
linalg::LinalgTransformationFilter filterTiled =
getLinalgMatchAndReplaceMarker(matchMarkers, getTileReductionMarker(),
diff --git a/iree/compiler/Codegen/SPIRV/SPIRVTileAndVectorizeToCooperativeOps.cpp b/iree/compiler/Codegen/SPIRV/SPIRVTileAndVectorizeToCooperativeOps.cpp
index d9fac5a..c0f32a7 100644
--- a/iree/compiler/Codegen/SPIRV/SPIRVTileAndVectorizeToCooperativeOps.cpp
+++ b/iree/compiler/Codegen/SPIRV/SPIRVTileAndVectorizeToCooperativeOps.cpp
@@ -27,8 +27,8 @@
#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
-#include "mlir/Dialect/Vector/VectorTransforms.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Interfaces/VectorInterfaces.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
diff --git a/iree/compiler/Codegen/SPIRV/SPIRVVectorToCooperativeOps.cpp b/iree/compiler/Codegen/SPIRV/SPIRVVectorToCooperativeOps.cpp
index 8113edd..abb6328 100644
--- a/iree/compiler/Codegen/SPIRV/SPIRVVectorToCooperativeOps.cpp
+++ b/iree/compiler/Codegen/SPIRV/SPIRVVectorToCooperativeOps.cpp
@@ -17,7 +17,7 @@
#include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
#include "mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h"
#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"
diff --git a/iree/compiler/Codegen/SPIRV/SPIRVVectorize.cpp b/iree/compiler/Codegen/SPIRV/SPIRVVectorize.cpp
index 57a8ac0..9bdeb80 100644
--- a/iree/compiler/Codegen/SPIRV/SPIRVVectorize.cpp
+++ b/iree/compiler/Codegen/SPIRV/SPIRVVectorize.cpp
@@ -21,8 +21,8 @@
#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
-#include "mlir/Dialect/Vector/VectorTransforms.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Interfaces/VectorInterfaces.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
diff --git a/iree/compiler/Codegen/SPIRV/SPIRVVectorizeLoadStore.cpp b/iree/compiler/Codegen/SPIRV/SPIRVVectorizeLoadStore.cpp
index a5bfc1f..b670d66 100644
--- a/iree/compiler/Codegen/SPIRV/SPIRVVectorizeLoadStore.cpp
+++ b/iree/compiler/Codegen/SPIRV/SPIRVVectorizeLoadStore.cpp
@@ -22,7 +22,7 @@
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/PatternMatch.h"
diff --git a/iree/compiler/Codegen/SPIRV/Utils.cpp b/iree/compiler/Codegen/SPIRV/Utils.cpp
index 5bcd11b..0af519b 100644
--- a/iree/compiler/Codegen/SPIRV/Utils.cpp
+++ b/iree/compiler/Codegen/SPIRV/Utils.cpp
@@ -59,51 +59,6 @@
return success();
}
-LogicalResult copyToWorkgroupMemory(OpBuilder &b, Value src, Value dst) {
- auto copyOp = b.create<linalg::CopyOp>(src.getLoc(), src, dst);
- setMarker(copyOp, getCopyToWorkgroupMemoryMarker());
- return success();
-}
-
-Optional<Value> allocateWorkgroupMemory(OpBuilder &b, memref::SubViewOp subview,
- ArrayRef<Value> boundingSubViewSize,
- DataLayout &layout) {
- // Allocate the memory in the entry block of the parent FuncOp. This better
- // aligns with the semantics of this memory, which is available at the entry
- // of the function.
- OpBuilder::InsertionGuard guard(b);
- FuncOp funcOp = subview->getParentOfType<FuncOp>();
- if (!funcOp) {
- subview.emitError("expected op to be within std.func");
- return llvm::None;
- }
- b.setInsertionPointToStart(&(*funcOp.getBody().begin()));
- // The bounding subview size is expected to be constant. This specifies the
- // shape of the allocation.
- SmallVector<int64_t, 2> shape = llvm::to_vector<2>(
- llvm::map_range(boundingSubViewSize, [](Value v) -> int64_t {
- APInt value;
- if (matchPattern(v, m_ConstantInt(&value))) return value.getSExtValue();
- return -1;
- }));
- if (llvm::any_of(shape, [](int64_t v) { return v == -1; })) return {};
- MemRefType allocType = MemRefType::get(
- shape, subview.getType().getElementType(), {}, getWorkgroupMemorySpace());
- Value buffer = b.create<memref::AllocOp>(subview.getLoc(), allocType);
- return buffer;
-}
-
-LogicalResult deallocateWorkgroupMemory(OpBuilder &b, Value buffer) {
- // There is no utility in an explicit deallocation (as of now). Instead the
- // workgroup memory is effectively stack memory that is automatically dead at
- // the end of the function. The SPIR-V lowering treats such deallocs as
- // no-ops, so don't insert one in the first place; just check that the
- // deallocation is for workgroup memory.
- MemRefType bufferType = buffer.getType().dyn_cast<MemRefType>();
- if (!bufferType) return failure();
- return success(bufferType.getMemorySpaceAsInt() == getWorkgroupMemorySpace());
-}
-
template <typename GPUIdOp, typename GPUCountOp>
static linalg::ProcInfo getGPUProcessorIdAndCountImpl(OpBuilder &builder,
Location loc,
diff --git a/iree/compiler/Codegen/SPIRV/Utils.h b/iree/compiler/Codegen/SPIRV/Utils.h
index 0ce2c15..9b70c28 100644
--- a/iree/compiler/Codegen/SPIRV/Utils.h
+++ b/iree/compiler/Codegen/SPIRV/Utils.h
@@ -41,24 +41,6 @@
const char *getSPIRVDistributeAttrName();
//===----------------------------------------------------------------------===//
-// Workgroup memory utils
-//===----------------------------------------------------------------------===//
-
-/// Allocation callback for allocating workgroup local memory.
-Optional<Value> allocateWorkgroupMemory(OpBuilder &b, memref::SubViewOp subview,
- ArrayRef<Value> boundingSubViewSize,
- DataLayout &layout);
-
-/// Function used as the copy-in/copy-out callback in the promotion pattern
-/// used to promote subviews to workgroup memory when the number of threads is
-/// known to be greater than or equal to the number of iterations of the loops
-/// the copy is lowered to.
-LogicalResult copyToWorkgroupMemory(OpBuilder &b, Value src, Value dst);
-
-/// Deallocation callback for workgroup local memory.
-LogicalResult deallocateWorkgroupMemory(OpBuilder &b, Value buffer);
-
-//===----------------------------------------------------------------------===//
// Processor ID/size utils
//===----------------------------------------------------------------------===//
diff --git a/iree/compiler/Codegen/SPIRV/test/BUILD b/iree/compiler/Codegen/SPIRV/test/BUILD
index 1eda09a..eed56fb 100644
--- a/iree/compiler/Codegen/SPIRV/test/BUILD
+++ b/iree/compiler/Codegen/SPIRV/test/BUILD
@@ -40,7 +40,6 @@
"tile_and_vectorize_matmul.mlir",
"tile_and_vectorize_to_cooperative_ops.mlir",
"vector_to_cooperative_matrix.mlir",
- "vectorize_copy_to_workgroup_memory.mlir",
"vectorize_elementwise_ops.mlir",
"vectorize_matmul.mlir",
"vectorize_load_store.mlir",
diff --git a/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt b/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt
index 4dcce06..d9282f5 100644
--- a/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt
+++ b/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt
@@ -35,7 +35,6 @@
"tile_and_vectorize_matmul.mlir"
"tile_and_vectorize_to_cooperative_ops.mlir"
"vector_to_cooperative_matrix.mlir"
- "vectorize_copy_to_workgroup_memory.mlir"
"vectorize_elementwise_ops.mlir"
"vectorize_load_store.mlir"
"vectorize_matmul.mlir"
diff --git a/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ext_ops.mlir b/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ext_ops.mlir
index f353485..58c867c 100644
--- a/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ext_ops.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ext_ops.mlir
@@ -87,7 +87,11 @@
%8 = memref.subview %0[%arg0, 0, %arg1] [%4, 32, %7] [1, 1, 1] : memref<64x32x128xi32> to memref<?x32x?xi32, affine_map<(d0, d1, d2)[s0] -> (d0 * 4096 + s0 + d1 * 128 + d2)>>
%9 = memref.cast %8 : memref<?x32x?xi32, affine_map<(d0, d1, d2)[s0] -> (d0 * 4096 + s0 + d1 * 128 + d2)>> to memref<?x?x?xi32>
%10 = memref.subview %1[%arg0, 0, %arg1] [%4, 32, %7] [1, 1, 1] : memref<64x32x128xi32> to memref<?x32x?xi32, affine_map<(d0, d1, d2)[s0] -> (d0 * 4096 + s0 + d1 * 128 + d2)>>
- linalg.copy(%9, %10) : memref<?x?x?xi32>, memref<?x32x?xi32, affine_map<(d0, d1, d2)[s0] -> (d0 * 4096 + s0 + d1 * 128 + d2)>>
+ linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]}
+ ins(%9 : memref<?x?x?xi32>) outs(%10 : memref<?x32x?xi32, affine_map<(d0, d1, d2)[s0] -> (d0 * 4096 + s0 + d1 * 128 + d2)>>) {
+ ^bb0(%arg4: i32, %s: i32): // no predecessors
+ linalg.yield %arg4 : i32
+ }
iree_linalg_ext.sort dimension(1) {__internal_linalg_transform__ = "workgroup"} outs(%10 : memref<?x32x?xi32, affine_map<(d0, d1, d2)[s0] -> (d0 * 4096 + s0 + d1 * 128 + d2)>>) {
^bb0(%arg2: i32, %arg3: i32): // no predecessors
%11 = arith.cmpi slt, %arg2, %arg3 : i32
diff --git a/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir b/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir
index 0df5042..d4dcf98 100644
--- a/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir
@@ -97,7 +97,11 @@
%9 = affine.apply affine_map<(d0) -> (d0 + 4)>(%arg0)
%10 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg1)
%11 = memref.subview %1[%9, %10] [%4, %7] [1, 1] : memref<?x?xi32> to memref<?x?xi32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
- linalg.copy(%8, %11) : memref<?x?xi32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, memref<?x?xi32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
+ linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]}
+ ins(%8 : memref<?x?xi32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>) outs(%11 : memref<?x?xi32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>) {
+ ^bb0(%arg4: i32, %s: i32): // no predecessors
+ linalg.yield %arg4 : i32
+ }
}
}
return
@@ -116,7 +120,7 @@
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[NWGSX:.+]] = affine.apply #[[MAP]]()[%[[ARG0]]]
// CHECK: hal.return %[[NWGSX]], %[[ARG1]], %[[C1]]
-// CHECK: linalg.copy
+// CHECK: linalg.generic
// CHECK-SAME: lowering.config = #[[CONFIG]]
// -----
@@ -166,7 +170,12 @@
%10 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 3)>(%arg2)[%workgroup_size_x]
%11 = memref.subview %1[0, %arg0, %arg1, %arg2] [1, %4, %7, %10] [1, 1, 1, 1] : memref<1x224x224x3xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 150528 + s0 + d1 * 672 + d2 * 3 + d3)>>
%12 = memref.subview %0[0, %arg0, %arg1, %arg2] [1, %4, %7, %10] [1, 1, 1, 1] : memref<1x225x225x3xf32> to memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
- linalg.copy(%11, %12) : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 150528 + s0 + d1 * 672 + d2 * 3 + d3)>>, memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>
+ linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
+ ins(%11 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 150528 + s0 + d1 * 672 + d2 * 3 + d3)>>)
+ outs(%12 : memref<1x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 151875 + s0 + d1 * 675 + d2 * 3 + d3)>>) {
+ ^bb0(%arg4: f32, %s: f32): // no predecessors
+ linalg.yield %arg4 : f32
+ }
}
}
}
@@ -188,7 +197,7 @@
// CHECK-DAG: %[[Z_COUNT:.+]] = affine.apply #[[MAP_Z]]()[%[[Z]]]
// CHECK: hal.return %[[X]], %[[Y_COUNT]], %[[Z_COUNT]]
-// CHECK: linalg.copy
+// CHECK: linalg.generic
// CHECK-SAME: lowering.config = #[[CONFIG]]
// -----
diff --git a/iree/compiler/Codegen/SPIRV/test/promote_workgroup_memory.mlir b/iree/compiler/Codegen/SPIRV/test/promote_workgroup_memory.mlir
index 39d8cb1..2aa76c7 100644
--- a/iree/compiler/Codegen/SPIRV/test/promote_workgroup_memory.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/promote_workgroup_memory.mlir
@@ -62,9 +62,9 @@
// CHECK: %[[RET0SV:.+]] = memref.subview %[[RET0]]
// CHECK: %[[SUBVIEW1:.+]] = memref.subview %[[ALLOC1]]
// CHECK: %[[SUBVIEW2:.+]] = memref.subview %[[ALLOC2]]
-// CHECK: linalg.copy(%[[ARG0SV]], %[[SUBVIEW1]])
+// CHECK: linalg.generic {{.*}} ins(%[[ARG0SV]] {{.*}} outs(%[[SUBVIEW1]]
// CHECK-SAME: "copy_to_workgroup_memory"
-// CHECK: linalg.copy(%[[ARG1SV]], %[[SUBVIEW2]])
+// CHECK: linalg.generic {{.*}} ins(%[[ARG1SV]] {{.*}} outs(%[[SUBVIEW2]]
// CHECK-SAME: "copy_to_workgroup_memory"
// CHECK: scf.for
// CHECK: scf.for
@@ -125,7 +125,7 @@
// CHECK: %[[ARG1SV:.+]] = memref.subview %[[ARG1]]
// CHECK: %[[RET0SV:.+]] = memref.subview %[[RET0]]
// CHECK: %[[SUBVIEW1:.+]] = memref.subview %[[ALLOC1]]
-// CHECK: linalg.copy(%[[ARG1SV]], %[[SUBVIEW1]])
+// CHECK: linalg.generic {{.*}} ins(%[[ARG1SV]] {{.*}} outs(%[[SUBVIEW1]]
// CHECK-SAME: "copy_to_workgroup_memory"
// CHECK: scf.for
// CHECK: scf.for
diff --git a/iree/compiler/Codegen/SPIRV/test/tile_and_distribute_sort.mlir b/iree/compiler/Codegen/SPIRV/test/tile_and_distribute_sort.mlir
index 5870a8e..6cd1344 100644
--- a/iree/compiler/Codegen/SPIRV/test/tile_and_distribute_sort.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/tile_and_distribute_sort.mlir
@@ -33,7 +33,12 @@
%5 = memref.cast %4 : memref<1x32x16xi32, affine_map<(d0, d1, d2)[s0] -> (d0 * 4096 + s0 + d1 * 128 + d2)>> to memref<?x?x?xi32>
%6 = memref.subview %1[%arg0, 0, %arg1] [1, 32, 16] [1, 1, 1] : memref<64x32x128xi32> to memref<1x32x16xi32, affine_map<(d0, d1, d2)[s0] -> (d0 * 4096 + s0 + d1 * 128 + d2)>>
%7 = memref.cast %6 : memref<1x32x16xi32, affine_map<(d0, d1, d2)[s0] -> (d0 * 4096 + s0 + d1 * 128 + d2)>> to memref<?x32x?xi32, affine_map<(d0, d1, d2)[s0] -> (d0 * 4096 + s0 + d1 * 128 + d2)>>
- linalg.copy(%5, %6) {lowering.config = #config} : memref<?x?x?xi32>, memref<1x32x16xi32, affine_map<(d0, d1, d2)[s0] -> (d0 * 4096 + s0 + d1 * 128 + d2)>>
+ linalg.generic {lowering.config = #config, indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]}
+ ins(%5 : memref<?x?x?xi32>)
+ outs(%6 : memref<1x32x16xi32, affine_map<(d0, d1, d2)[s0] -> (d0 * 4096 + s0 + d1 * 128 + d2)>>) {
+ ^bb0(%arg4: i32, %s: i32): // no predecessors
+ linalg.yield %arg4 : i32
+ }
iree_linalg_ext.sort dimension(1) {lowering.config = #config} outs(%7 : memref<?x32x?xi32, affine_map<(d0, d1, d2)[s0] -> (d0 * 4096 + s0 + d1 * 128 + d2)>>) {
^bb0(%arg2: i32, %arg3: i32): // no predecessors
%8 = arith.cmpi slt, %arg2, %arg3 : i32
@@ -63,7 +68,7 @@
// CHECK: scf.for %[[IV_X:.+]] = %[[TID_X]] to %{{.+}} step %[[DIM_X]]
// CHECK: %[[COPY_SOURCE:.+]] = memref.subview %[[WG_INPUT_CAST]][%[[IV_Y]], 0, %[[IV_X]]]
// CHECK: %[[COPY_DEST:.+]] = memref.subview %[[WG_OUTPUT]][%[[IV_Y]], 0, %[[IV_X]]]
-// CHECK: linalg.copy(%[[COPY_SOURCE]], %[[COPY_DEST]])
+// CHECK: linalg.generic {{.*}} ins(%[[COPY_SOURCE]] {{.*}} outs(%[[COPY_DEST]]
// CHECK: scf.for %[[IV_Y:.+]] = %[[TID_Y]] to %{{.+}} step %[[DIM_Y]]
// CHECK: scf.for %[[IV_X:.+]] = %[[TID_X]] to %{{.+}} step %[[DIM_X]]
// CHECK: %[[COPY_DEST:.+]] = memref.subview %[[WG_OUTPUT]][%[[IV_Y]], 0, %[[IV_X]]]
diff --git a/iree/compiler/Codegen/SPIRV/test/vectorize_copy_to_workgroup_memory.mlir b/iree/compiler/Codegen/SPIRV/test/vectorize_copy_to_workgroup_memory.mlir
deleted file mode 100644
index de6086d..0000000
--- a/iree/compiler/Codegen/SPIRV/test/vectorize_copy_to_workgroup_memory.mlir
+++ /dev/null
@@ -1,43 +0,0 @@
-// RUN: iree-opt -split-input-file -iree-spirv-copy-to-workgroup-memory %s
-// TODO(antiagainst): Re-enable vectorizing workgroup memory copy once the
-// whole pipeline is in a better state.
-// | FileCheck %s
-
-#map0 = affine_map<(d0, d1)[s0] -> (d0 * 4096 + s0 + d1)>
-
-module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
- func @copy(%arg0: memref<4096x4096xf32>) attributes {spv.entry_point_abi = {local_size = dense<[128, 1, 1]> : vector<3xi32>}} {
- %a = memref.alloc() : memref<128x32xf32, 3>
- %c0 = arith.constant 0 : index
- %sv = memref.subview %arg0[%c0, %c0] [128, 32] [1, 1] : memref<4096x4096xf32> to memref<128x32xf32, #map0>
- linalg.copy(%sv, %a) {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<128x32xf32, #map0>, memref<128x32xf32, 3>
- return
- }
- // CHECK: #[[MAP1:.+]] = affine_map<(d0) -> (d0 * 4)>
-
- // CHECK-DAG: %[[C1024:.+]] = arith.constant 1024 : index
- // CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
- // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
- // CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<128x32xf32, 3>
- // CHECK: %[[DST:.+]] = memref.subview %{{.+}}[0, 0] [128, 32] [1, 1] : memref<4096x4096xf32> to memref<128x32xf32, #map0>
- // CHECK: %[[TIDx:.+]] = gpu.thread_id x
- // CHECK: %[[DIMx:.+]] = gpu.block_dim x
- // CHECK: %[[TIDy:.+]] = gpu.thread_id y
- // CHECK: %[[DIMy:.+]] = gpu.block_dim y
- // CHECK: %[[TIDz:.+]] = gpu.thread_id z
- // CHECK: %[[DIMz:.+]] = gpu.block_dim z
- // CHECK: %[[LIDz:.+]] = arith.muli %[[TIDz]], %[[DIMy]] : index
- // CHECK: %[[LIDzy:.+]] = arith.addi %[[LIDz]], %[[TIDy]] : index
- // CHECK: %[[DIMzy:.+]] = arith.muli %[[DIMz]], %[[DIMy]] : index
- // CHECK: %[[LIDzyx:.+]] = arith.muli %[[LIDzy]], %[[DIMx]] : index
- // CHECK: %[[LID:.+]] = arith.addi %[[LIDzyx]], %[[TIDx]] : index
- // CHECK: %[[DIMzyx:.+]] = arith.muli %[[DIMzy]], %[[DIMx]] : index
- // CHECK: scf.for %[[IV:.+]] = %[[LID]] to %[[C1024]] step %[[DIMzyx]] {
- // CHECK: %[[SIZEx:.+]] = arith.divsi %[[IV]], %[[C8]] : index
- // CHECK: %[[MOD:.+]] = arith.remsi %[[IV]], %[[C8]] : index
- // CHECK: %[[SIZEy:.+]] = affine.apply #[[MAP1]](%[[MOD]])
- // CHECK: %[[SVs:.+]] = memref.subview %[[DST]][%[[SIZEx]], %[[SIZEy]]] [1, 4] [1, 1] : memref<128x32xf32, #map0> to memref<1x4xf32
- // CHECK: %[[SVd:.+]] = memref.subview %[[ALLOC]][%[[SIZEx]], %[[SIZEy]]] [1, 4] [1, 1] : memref<128x32xf32, 3> to memref<1x4xf32
- // CHECK: %[[LOAD:.+]] = vector.transfer_read %[[SVs]][%c0, %c0], %cst {{.*}} : memref<1x4xf32, {{.*}}>, vector<1x4xf32>
- // CHECK: vector.transfer_write %[[LOAD]], %[[SVd]][%[[C0]], %[[C0]]] {{.*}} : vector<1x4xf32>, memref<1x4xf32
-}
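
The constants in the deleted CHECK lines follow directly from the shapes: a
128x32 tile copied as 1x4 vectors is 8 vector units per row and 1024 units in
total, and a linear unit id iv addresses element (iv / 8, 4 * (iv mod 8));
this is where %[[C8]], %[[C1024]], and #[[MAP1]] come from. A compile-time
restatement, purely illustrative:

  #include <cstdint>

  constexpr int64_t kRows = 128, kCols = 32, kVecWidth = 4;
  constexpr int64_t kUnitsPerRow = kCols / kVecWidth;    // 8
  constexpr int64_t kTotalUnits = kRows * kUnitsPerRow;  // 1024
  static_assert(kUnitsPerRow == 8 && kTotalUnits == 1024,
                "loop bound and divisor in the deleted CHECKs");
  // Row of unit iv:    iv / kUnitsPerRow
  // Column of unit iv: kVecWidth * (iv % kUnitsPerRow)
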
diff --git a/iree/compiler/Codegen/Sandbox/BUILD b/iree/compiler/Codegen/Sandbox/BUILD
index 9c813ca..0663539 100644
--- a/iree/compiler/Codegen/Sandbox/BUILD
+++ b/iree/compiler/Codegen/Sandbox/BUILD
@@ -61,9 +61,11 @@
"@llvm-project//mlir:LinalgTransforms",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:SCFDialect",
+ "@llvm-project//mlir:SCFUtils",
"@llvm-project//mlir:TensorDialect",
"@llvm-project//mlir:Transforms",
"@llvm-project//mlir:VectorOps",
+ "@llvm-project//mlir:VectorTransforms",
"@llvm-project//mlir:X86VectorTransforms",
],
)
diff --git a/iree/compiler/Codegen/Sandbox/CMakeLists.txt b/iree/compiler/Codegen/Sandbox/CMakeLists.txt
index 42f7e61..ad7ad47 100644
--- a/iree/compiler/Codegen/Sandbox/CMakeLists.txt
+++ b/iree/compiler/Codegen/Sandbox/CMakeLists.txt
@@ -52,9 +52,11 @@
MLIRLinalgTransforms
MLIRPass
MLIRSCF
+ MLIRSCFUtils
MLIRTensor
MLIRTransforms
MLIRVector
+ MLIRVectorTransforms
MLIRX86VectorTransforms
iree::compiler::Codegen::Dialect::IREECodegenDialect
iree::compiler::Codegen::Utils
diff --git a/iree/compiler/Codegen/Sandbox/LinalgTensorCodegenDriver.cpp b/iree/compiler/Codegen/Sandbox/LinalgTensorCodegenDriver.cpp
index 78c111e..b3a981a 100644
--- a/iree/compiler/Codegen/Sandbox/LinalgTensorCodegenDriver.cpp
+++ b/iree/compiler/Codegen/Sandbox/LinalgTensorCodegenDriver.cpp
@@ -16,9 +16,11 @@
#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
-#include "mlir/Dialect/Vector/VectorTransforms.h"
+#include "mlir/Dialect/SCF/Utils/Utils.h"
+#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Dialect/X86Vector/Transforms.h"
#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"
using namespace mlir;
@@ -79,6 +81,19 @@
//===----------------------------------------------------------------------===//
namespace {
+
+static void getAtMostNEnclosingLoops(
+ Operation *op, int64_t nLoops,
+ SmallVector<scf::ForOp> &reverseEnclosingLoops) {
+ scf::ForOp outermostEnclosingForOp = nullptr;
+ Operation *nextEnclosingOp = op->getParentOp();
+ while (nLoops-- > 0 &&
+ (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
+ reverseEnclosingLoops.push_back(outermostEnclosingForOp);
+ nextEnclosingOp = outermostEnclosingForOp->getParentOp();
+ }
+}
+
struct LinalgFusePass : public LinalgFuseBase<LinalgFusePass> {
LinalgFusePass(int64_t tilingLevel = -1, bool vectorize = false) {
this->tilingLevel.setValue(tilingLevel);
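
getAtMostNEnclosingLoops, added above, collects enclosing scf.for ops
innermost-first and stops at the first ancestor that is not an scf.for, so
the back of the vector is the outermost loop collected;
UnrollOneParentLoopPass further below relies on exactly that. A small usage
sketch, where contractOp is a hypothetical op nested in a perfect scf.for
pair:

  // IR shape assumed:  scf.for %i { scf.for %j { <contractOp> } }
  SmallVector<scf::ForOp> loops;
  getAtMostNEnclosingLoops(contractOp, /*nLoops=*/2, loops);
  // loops[0] is the %j (innermost) loop, loops[1] the %i loop, so
  // loops.back() is the outermost collected loop that gets unrolled.
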
@@ -93,6 +108,7 @@
this->pad = options.pad;
this->packPaddings = options.packPaddings;
this->hoistPaddings = options.hoistPaddings;
+ this->transposePaddings = options.transposePaddings;
this->vectorize = options.vectorize;
this->vectorizePadding = options.vectorizePadding;
this->tilingLevel = options.tilingLevel;
@@ -113,6 +129,7 @@
this->pad = options.pad;
this->packPaddings = options.packPaddings;
this->hoistPaddings = options.hoistPaddings;
+ this->transposePaddings = options.transposePaddings;
this->packPaddings = options.packPaddings;
this->scalarizeDynamicDims = options.scalarizeDynamicDims;
this->generalize = options.generalize;
@@ -147,6 +164,27 @@
void runOnOperation() override;
};
+
+struct UnrollOneVectorOpPass
+ : public UnrollOneVectorOpBase<UnrollOneVectorOpPass> {
+ UnrollOneVectorOpPass() = default;
+ UnrollOneVectorOpPass(const UnrollOneVectorOpPass &pass) {}
+ void runOnOperation() override;
+};
+
+struct UnrollOneParentLoopPass
+ : public UnrollOneParentLoopBase<UnrollOneParentLoopPass> {
+ UnrollOneParentLoopPass() = default;
+ UnrollOneParentLoopPass(const UnrollOneParentLoopPass &pass) {}
+ void runOnOperation() override;
+};
+
+struct OutlineOneParentLoopPass
+ : public OutlineOneParentLoopBase<OutlineOneParentLoopPass> {
+ OutlineOneParentLoopPass() = default;
+ OutlineOneParentLoopPass(const OutlineOneParentLoopPass &pass) {}
+ void runOnOperation() override;
+};
} // namespace
/// Return the neutral element as a new Value.
@@ -176,10 +214,8 @@
doTiling = true;
tilingOptions.tileSizes = {tileSizes.begin(), tileSizes.end()};
}
- if (!tileInterchange.empty()) {
- tilingOptions.tileInterchange = {tileInterchange.begin(),
- tileInterchange.end()};
- }
+ tilingOptions.tileInterchange = {tileInterchange.begin(),
+ tileInterchange.end()};
// Set up padding options.
// TODO: Replace the lambdas by either functions defined in MLIR core or even
@@ -195,10 +231,22 @@
? hoistPaddings[opOperand.getOperandNumber()]
: 0;
};
+ auto transposeFunc = [&](OpOperand &opOperand) {
+ SmallVector<int64_t> transposeVector = {};
+ if (opOperand.getOperandNumber() >= transposePaddings.size())
+ return transposeVector;
+ SmallVector<StringRef> elems;
+ StringRef(transposePaddings[opOperand.getOperandNumber()])
+ .split(elems, ':');
+ for (StringRef elem : elems)
+ transposeVector.push_back(std::stoi(elem.str()));
+ return transposeVector;
+ };
LinalgPaddingOptions paddingOptions;
paddingOptions.setPaddingValueComputationFunction(getNeutralOfLinalgOp);
paddingOptions.setPaddingNoFoldComputationFunction(packFunc);
paddingOptions.setPaddingHoistComputationFunction(hoistingFunc);
+ paddingOptions.setPaddingTransposeComputationFunction(transposeFunc);
CodegenStrategy strategy;
strategy.tileAndFuseIf(doTiling, anchorOpName, tilingOptions)
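
transposeFunc decodes one colon-separated interchange vector per padded
operand; operands past the end of transposePaddings get an empty vector,
meaning no transpose. A worked example using the same split/stoi logic as the
lambda above (the option values are hypothetical):

  // With transposePaddings = {"0:2:1", "0:1"}, operand #0 yields:
  SmallVector<StringRef> elems;
  StringRef("0:2:1").split(elems, ':');  // {"0", "2", "1"}
  SmallVector<int64_t> transposeVector;
  for (StringRef elem : elems)
    transposeVector.push_back(std::stoi(elem.str()));
  // transposeVector == {0, 2, 1}; operand #2 (past the list) is left alone.
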
@@ -209,7 +257,9 @@
OpPassManager dynamicPM(FuncOp::getOperationName());
strategy.configurePassPipeline(dynamicPM, funcOp.getContext());
- if (failed(runPipeline(dynamicPM, funcOp))) return signalPassFailure();
+ if (failed(runPipeline(dynamicPM, funcOp))) {
+ return signalPassFailure();
+ }
}
void LinalgSingleTilingExpertPass::runOnOperation() {
@@ -246,17 +296,29 @@
? hoistPaddings[opOperand.getOperandNumber()]
: 0;
};
+ auto transposeFunc = [&](OpOperand &opOperand) {
+ SmallVector<int64_t> transposeVector = {};
+ if (opOperand.getOperandNumber() >= transposePaddings.size())
+ return transposeVector;
+ SmallVector<StringRef> elems;
+ StringRef(transposePaddings[opOperand.getOperandNumber()])
+ .split(elems, ':');
+ for (StringRef elem : elems)
+ transposeVector.push_back(std::stoi(elem.str()));
+ return transposeVector;
+ };
LinalgPaddingOptions paddingOptions;
paddingOptions.setPaddingValueComputationFunction(getNeutralOfLinalgOp);
paddingOptions.setPaddingNoFoldComputationFunction(packFunc);
paddingOptions.setPaddingHoistComputationFunction(hoistingFunc);
+ paddingOptions.setPaddingTransposeComputationFunction(transposeFunc);
CodegenStrategy strategy;
StringRef genericOpName = GenericOp::getOperationName();
strategy.tileIf(doTiling, anchorOpName, tilingOptions)
.padIf(pad, anchorOpName, paddingOptions)
+ .decomposeIf(decomposeToLowerDimOp)
.generalizeIf(generalize, anchorOpName)
- // TODO: decomposeToLowerDimIf when the need arises.
.interchangeIf(!iteratorInterchange.empty(), iteratorInterchange)
.vectorizeIf(vectorize, generalize ? genericOpName : anchorOpName,
nullptr, vectorizePadding);
@@ -264,12 +326,9 @@
// Create a nested OpPassManager and run it.
OpPassManager dynamicPM(FuncOp::getOperationName());
strategy.configurePassPipeline(dynamicPM, funcOp.getContext());
-
- if (decomposeToLowerDimOp) {
- dynamicPM.addPass(createLinalgStrategyDecomposePass());
+ if (failed(runPipeline(dynamicPM, funcOp))) {
+ return signalPassFailure();
}
-
- if (failed(runPipeline(dynamicPM, funcOp))) return signalPassFailure();
}
void LinalgVectorLoweringPass::runOnOperation() {
@@ -349,7 +408,91 @@
OpPassManager dynamicPM(FuncOp::getOperationName());
FuncOp funcOp = getOperation();
strategy.configurePassPipeline(dynamicPM, funcOp.getContext());
- if (failed(runPipeline(dynamicPM, funcOp))) return signalPassFailure();
+ if (failed(runPipeline(dynamicPM, funcOp))) {
+ return signalPassFailure();
+ }
+}
+
+void UnrollOneVectorOpPass::runOnOperation() {
+ if (getOperation().getName() != anchorFuncOpName) return;
+
+ MLIRContext *ctx = &getContext();
+ RewritePatternSet patterns(ctx);
+ vector::populateVectorUnrollPatterns(
+ patterns, vector::UnrollVectorOptions()
+ .setNativeShape(targetShape)
+ .setFilterConstraint([&](Operation *op) {
+ auto unrollInterface =
+ dyn_cast<VectorUnrollOpInterface>(op);
+ if (!unrollInterface ||
+ op->getName().getStringRef() != anchorOpName ||
+ !sourceShape.hasValue() ||
+ !unrollInterface.getShapeForUnroll().hasValue())
+ return failure();
+
+ ArrayRef<int64_t> sourceShapeToMatch{sourceShape};
+ auto shapeForUnroll =
+ unrollInterface.getShapeForUnroll().getValue();
+ ArrayRef<int64_t> actualSourceShape{
+ shapeForUnroll.begin(), shapeForUnroll.end()};
+ return success(sourceShapeToMatch == actualSourceShape);
+ }));
+ vector::populateVectorToVectorCanonicalizationPatterns(patterns);
+ (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
+}
+
+void UnrollOneParentLoopPass::runOnOperation() {
+ if (getOperation().getName() != anchorFuncOpName) return;
+
+ // Poor man's op targeting.
+ getOperation().walk([&](Operation *op) {
+ if (op->getName().getStringRef() != anchorOpName)
+ return WalkResult::advance();
+ SmallVector<scf::ForOp> reverseEnclosingLoops;
+ getAtMostNEnclosingLoops(op, parentLoopNum, reverseEnclosingLoops);
+ if (failed(loopUnrollByFactor(reverseEnclosingLoops.back(), unrollFactor)))
+ signalPassFailure();
+ return WalkResult::interrupt();
+ });
+}
+
+scf::ExecuteRegionOp outlineInExecuteRegion(RewriterBase &b, Operation *op) {
+ if (op->getNumRegions() != 1) return nullptr;
+ OpBuilder::InsertionGuard g(b);
+ b.setInsertionPoint(op);
+ scf::ExecuteRegionOp executeRegionOp =
+ b.create<scf::ExecuteRegionOp>(op->getLoc(), op->getResultTypes());
+ {
+ OpBuilder::InsertionGuard g(b);
+ b.setInsertionPointToStart(&executeRegionOp.getRegion().emplaceBlock());
+ Operation *clonedOp = b.cloneWithoutRegions(*op);
+ Region &clonedRegion = clonedOp->getRegions().front();
+ assert(clonedRegion.empty() && "expected empty region");
+ b.inlineRegionBefore(op->getRegions().front(), clonedRegion,
+ clonedRegion.end());
+ b.create<scf::YieldOp>(op->getLoc(), clonedOp->getResults());
+ }
+ b.replaceOp(op, executeRegionOp.getResults());
+ return executeRegionOp;
+}
+
+void OutlineOneParentLoopPass::runOnOperation() {
+ if (getOperation().getName() != anchorFuncOpName) return;
+
+ // Poor man's op targeting.
+ getOperation().walk([&](Operation *op) {
+ if (op->getName().getStringRef() != anchorOpName)
+ return WalkResult::advance();
+ SmallVector<scf::ForOp> reverseEnclosingLoops;
+ getAtMostNEnclosingLoops(op, parentLoopNum, reverseEnclosingLoops);
+ IRRewriter b(op->getContext());
+ scf::ExecuteRegionOp exec =
+ outlineInExecuteRegion(b, reverseEnclosingLoops.back());
+ if (failed(outlineSingleBlockRegion(b, op->getLoc(), exec.getRegion(),
+ resultFuncName)))
+ signalPassFailure();
+ return WalkResult::interrupt();
+ });
}
std::unique_ptr<OperationPass<FuncOp>> mlir::createLinalgFusePass() {
@@ -378,6 +521,18 @@
return std::make_unique<LinalgVectorLoweringPass>(options);
}
+std::unique_ptr<OperationPass<FuncOp>> mlir::createUnrollOneVectorOpPass() {
+ return std::make_unique<UnrollOneVectorOpPass>();
+}
+
+std::unique_ptr<OperationPass<FuncOp>> mlir::createUnrollOneParentLoopPass() {
+ return std::make_unique<UnrollOneParentLoopPass>();
+}
+
+std::unique_ptr<OperationPass<FuncOp>> mlir::createOutlineOneParentLoopPass() {
+ return std::make_unique<OutlineOneParentLoopPass>();
+}
+
//===----------------------------------------------------------------------===//
// Transforms
//===----------------------------------------------------------------------===//
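The transposeFunc lambdas above parse the new transpose-paddings option, which
encodes one permutation per operand as colon-separated indices. A minimal
standalone sketch of that parsing (illustrative only; assumes LLVM's StringRef
and SmallVector are in scope, plus <string> for std::stoi):

  // Parses one transpose-paddings entry such as "1:0" into a permutation
  // vector, mirroring the lambda used by both passes above.
  SmallVector<int64_t> parseTransposeEntry(StringRef entry) {
    SmallVector<int64_t> permutation;
    SmallVector<StringRef> elems;
    entry.split(elems, ':');
    for (StringRef elem : elems)
      permutation.push_back(std::stoi(elem.str()));
    return permutation;  // "1:0" -> [1, 0]
  }

For example, transpose-paddings=1:0,0:1 requests permutation [1, 0] for the
padded operand 0 and leaves operand 1 in its original [0, 1] order.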
diff --git a/iree/compiler/Codegen/Sandbox/PassDetail.h b/iree/compiler/Codegen/Sandbox/PassDetail.h
index bc4cc6e..4c0e089 100644
--- a/iree/compiler/Codegen/Sandbox/PassDetail.h
+++ b/iree/compiler/Codegen/Sandbox/PassDetail.h
@@ -12,7 +12,7 @@
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Pass/Pass.h"
namespace mlir {
diff --git a/iree/compiler/Codegen/Sandbox/Passes.h b/iree/compiler/Codegen/Sandbox/Passes.h
index 2bcf1ff..1878ba8 100644
--- a/iree/compiler/Codegen/Sandbox/Passes.h
+++ b/iree/compiler/Codegen/Sandbox/Passes.h
@@ -20,6 +20,7 @@
bool pad = false;
SmallVector<int64_t> packPaddings = {};
SmallVector<int64_t> hoistPaddings = {};
+ SmallVector<std::string> transposePaddings = {};
bool vectorize = false;
bool vectorizePadding = false;
int64_t tilingLevel = -1;
@@ -40,6 +41,7 @@
bool pad = false;
SmallVector<int64_t> packPaddings = {};
SmallVector<int64_t> hoistPaddings = {};
+ SmallVector<std::string> transposePaddings = {};
bool scalarizeDynamicDims = false;
bool generalize = false;
SmallVector<int64_t> iteratorInterchange = {};
@@ -73,6 +75,16 @@
std::unique_ptr<OperationPass<FuncOp>> createLinalgVectorLoweringPass(
const LinalgVectorLoweringPassOptions &options);
+/// Create a pass to drive the unrolling of a single vector op.
+std::unique_ptr<OperationPass<FuncOp>> createUnrollOneVectorOpPass();
+
+/// Create a pass to drive the unrolling of a single parent loop of an op.
+std::unique_ptr<OperationPass<FuncOp>> createUnrollOneParentLoopPass();
+
+/// Create a pass to drive the outlining of a single parent loop of an op
+/// into a new function.
+std::unique_ptr<OperationPass<FuncOp>> createOutlineOneParentLoopPass();
+
//===----------------------------------------------------------------------===//
// Transforms that tie together individual drivers.
//===----------------------------------------------------------------------===//
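Since all three new drivers locate their target through pass options rather
than through an API, the intended way to run them is a textual pipeline. A
hedged usage sketch (option values and the surrounding `context`/`module`
variables are illustrative):

  // Unroll the first enclosing loop of the vector.contract anchored in
  // @test by a factor of 4, then run the assembled pipeline.
  PassManager pm(context);
  if (failed(parsePassPipeline(
          "builtin.func(unroll-one-parent-loop{anchor-func=test "
          "anchor-op=vector.contract parent-loop-num=1 unroll-factor=4})",
          pm)))
    return failure();
  if (failed(pm.run(module)))
    return failure();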
diff --git a/iree/compiler/Codegen/Sandbox/Passes.td b/iree/compiler/Codegen/Sandbox/Passes.td
index 2f3dd6c..1c1eabf 100644
--- a/iree/compiler/Codegen/Sandbox/Passes.td
+++ b/iree/compiler/Codegen/Sandbox/Passes.td
@@ -32,13 +32,15 @@
ListOption<"hoistPaddings", "hoist-paddings", "int64_t",
"Hoist padding depths.",
"llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">,
-
+ ListOption<"transposePaddings", "transpose-paddings", "std::string",
+ "Transpose paddings.",
+ "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">,
// Vectorization options.
Option<"vectorize", "vectorize", "bool", /*default=*/"false",
"Rewrite the linalg op as a vector operation.">,
Option<"vectorizePadding", "vectorize-padding", "bool", /*default=*/"false",
- "Rewrite all linalg.pad_tensor ops in the function to vector form.">,
-
+ "Rewrite all tensor.pad ops in the function to vector form.">,
+
// IREE specific options
Option<"tilingLevel", "tiling-level", "int64_t", /*default=*/"-1",
"Use default tiling level used to retrieve the configuration from lowering.config">
@@ -78,6 +80,9 @@
ListOption<"hoistPaddings", "hoist-paddings", "int64_t",
"Hoist padding depths.",
"llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">,
+ ListOption<"transposePaddings", "transpose-paddings", "std::string",
+ "Transpose paddings.",
+ "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">,
Option<"scalarizeDynamicDims", "scalarize-dynamic-dims", "bool",
/*default=*/"false", "Tile dynamic dimensions by 1.">,
@@ -132,7 +137,7 @@
[{Split vector transfers between slow (masked) and fast "
"(unmasked) variants. Possible options are:\n"
"\tnone [default]: keep unsplit vector.transfer and pay the price\n"
- "\tlinalg-copy: use linalg.fill + linalg.copy for the slow path\n"
+ "\tlinalg-copy: use linalg.fill + linalg.generic for the slow path\n"
"\tvector-transfers: use extra small unmasked vector.transfers for"
" the slow path\n}]>,
Option<"lowerVectorTransposeTo", "lower-vector-transpose-to",
@@ -169,4 +174,73 @@
];
}
+def UnrollOneVectorOp : Pass<"unroll-one-vector-op", "FuncOp"> {
+ let summary = "Pass to unroll a vector op to a target size.";
+ let constructor = "mlir::createUnrollOneVectorOpPass()";
+ let options = [
+ Option<"anchorFuncOpName", "anchor-func", "std::string", /*default=*/"",
+ "Which func op is the anchor to latch on.">,
+ Option<"anchorOpName", "anchor-op", "std::string", /*default=*/"",
+ "Which unique op within the func is the anchor to latch on.">,
+
+ // UnrollOneVectorOp options.
+ ListOption<"sourceShape", "source-shape", "int64_t", "Source vector shape",
+ "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">,
+ ListOption<"targetShape", "target-shape", "int64_t", "Target vector shape",
+ "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">,
+ ];
+ let dependentDialects = [
+ "::mlir::arith::ArithmeticDialect", "::mlir::AffineDialect",
+ "::mlir::linalg::LinalgDialect", "::mlir::scf::SCFDialect",
+ "::mlir::StandardOpsDialect", "::mlir::tensor::TensorDialect",
+ "::mlir::vector::VectorDialect"
+ ];
+}
+
+def UnrollOneParentLoop : Pass<"unroll-one-parent-loop", "FuncOp"> {
+ let summary = "Pass to unroll the k^th parent loop of an op by some amount.";
+ let constructor = "mlir::createUnrollOneParentLoopPass()";
+ let options = [
+ Option<"anchorFuncOpName", "anchor-func", "std::string", /*default=*/"",
+ "Which func op is the anchor to latch on.">,
+ Option<"anchorOpName", "anchor-op", "std::string", /*default=*/"",
+ "Which unique op within the func is the anchor to latch on.">,
+
+ // UnrollOneParentLoop options.
+ Option<"parentLoopNum", "parent-loop-num", "unsigned", /*default=*/"1",
+ "Number of the parent loop to latch on.">,
+ Option<"unrollFactor", "unroll-factor", "unsigned", /*default=*/"1",
+ "Unroll factor.">,
+ ];
+ let dependentDialects = [
+ "::mlir::arith::ArithmeticDialect", "::mlir::AffineDialect",
+ "::mlir::linalg::LinalgDialect", "::mlir::scf::SCFDialect",
+ "::mlir::StandardOpsDialect", "::mlir::tensor::TensorDialect",
+ "::mlir::vector::VectorDialect"
+ ];
+}
+
+def OutlineOneParentLoop : Pass<"outline-one-parent-loop", "FuncOp"> {
+ let summary = "Pass to outline the k^th parent loop of an op.";
+ let constructor = "mlir::createOutlineOneParentLoopPass()";
+ let options = [
+ Option<"anchorFuncOpName", "anchor-func", "std::string", /*default=*/"",
+ "Which func op is the anchor to latch on.">,
+ Option<"anchorOpName", "anchor-op", "std::string", /*default=*/"",
+ "Which unique op within the func is the anchor to latch on.">,
+
+ // OutlineOneParentLoop options.
+ Option<"parentLoopNum", "parent-loop-num", "unsigned", /*default=*/"1",
+ "Number of the parent loop to latch on.">,
+ Option<"resultFuncName", "result-func-name", "std::string", /*default=*/"",
+ "Name of the func op produced by outlining.">,
+ ];
+ let dependentDialects = [
+ "::mlir::arith::ArithmeticDialect", "::mlir::AffineDialect",
+ "::mlir::linalg::LinalgDialect", "::mlir::scf::SCFDialect",
+ "::mlir::StandardOpsDialect", "::mlir::tensor::TensorDialect",
+ "::mlir::vector::VectorDialect"
+ ];
+}
+
#endif // IREE_CODEGEN_SANDBOX_PASSES_TD
diff --git a/iree/compiler/Codegen/Sandbox/test/BUILD b/iree/compiler/Codegen/Sandbox/test/BUILD
index 3340d08..55f3025 100644
--- a/iree/compiler/Codegen/Sandbox/test/BUILD
+++ b/iree/compiler/Codegen/Sandbox/test/BUILD
@@ -20,7 +20,9 @@
srcs = enforce_glob(
[
"fusion_expert.mlir",
+ "outline_one_parent_loop.mlir",
"single_tiling_expert.mlir",
+ "unroll_one_vector_op.mlir",
],
include = ["*.mlir"],
),
diff --git a/iree/compiler/Codegen/Sandbox/test/CMakeLists.txt b/iree/compiler/Codegen/Sandbox/test/CMakeLists.txt
index ae4b2eb..5c3e835 100644
--- a/iree/compiler/Codegen/Sandbox/test/CMakeLists.txt
+++ b/iree/compiler/Codegen/Sandbox/test/CMakeLists.txt
@@ -15,7 +15,9 @@
lit
SRCS
"fusion_expert.mlir"
+ "outline_one_parent_loop.mlir"
"single_tiling_expert.mlir"
+ "unroll_one_vector_op.mlir"
TOOLS
FileCheck
iree::tools::iree-opt
diff --git a/iree/compiler/Codegen/Sandbox/test/outline_one_parent_loop.mlir b/iree/compiler/Codegen/Sandbox/test/outline_one_parent_loop.mlir
new file mode 100644
index 0000000..f6b671d
--- /dev/null
+++ b/iree/compiler/Codegen/Sandbox/test/outline_one_parent_loop.mlir
@@ -0,0 +1,111 @@
+// RUN: iree-opt %s -outline-one-parent-loop="anchor-func=test anchor-op=scf.yield parent-loop-num=1 result-func-name=foo" | FileCheck %s
+// RUN: iree-opt %s -outline-one-parent-loop="anchor-func=matmul anchor-op=vector.contract parent-loop-num=2 result-func-name=bar" | FileCheck %s --check-prefix=MATMUL
+
+// CHECK-LABEL: func @foo
+// CHECK-LABEL: func @test
+func @test(%ub: index, %it: index) -> index {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %res = scf.for %i = %c0 to %ub step %c1 iter_args(%bbit = %it) -> (index) {
+ scf.yield %bbit : index
+ }
+ return %res: index
+}
+
+// MATMUL-LABEL: func @bar
+// MATMUL-LABEL: func @matmul
+func @matmul(%arg0: tensor<24x48xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg1: tensor<48x32xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg2: tensor<24x32xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true}) -> tensor<24x32xf32> attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
+ %c0 = arith.constant 0 : index
+ %c32 = arith.constant 32 : index
+ %c24 = arith.constant 24 : index
+ %c16 = arith.constant 16 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %c8 = arith.constant 8 : index
+ %c48 = arith.constant 48 : index
+ %0 = linalg.init_tensor [2, 2, 8, 32] : tensor<2x2x8x32xf32>
+ %1 = tensor.cast %0 : tensor<2x2x8x32xf32> to tensor<?x?x8x32xf32>
+ %2 = linalg.init_tensor [2, 2, 32, 8] : tensor<2x2x32x8xf32>
+ %3 = tensor.cast %2 : tensor<2x2x32x8xf32> to tensor<?x?x32x8xf32>
+ %4 = scf.for %arg3 = %c0 to %c24 step %c16 iter_args(%arg4 = %arg2) -> (tensor<24x32xf32>) {
+ %5 = affine.min affine_map<(d0) -> (16, -d0 + 24)>(%arg3)
+ %6 = scf.for %arg5 = %c0 to %c32 step %c16 iter_args(%arg6 = %arg4) -> (tensor<24x32xf32>) {
+ %7 = tensor.extract_slice %arg6[%arg3, %arg5] [%5, 16] [1, 1] : tensor<24x32xf32> to tensor<?x16xf32>
+ %8 = scf.for %arg7 = %c0 to %5 step %c8 iter_args(%arg8 = %7) -> (tensor<?x16xf32>) {
+ %13 = affine.min affine_map<(d0, d1) -> (8, -d0 + d1)>(%arg7, %5)
+ %14 = scf.for %arg9 = %c0 to %c16 step %c8 iter_args(%arg10 = %arg8) -> (tensor<?x16xf32>) {
+ %15 = tensor.extract_slice %arg10[%arg7, %arg9] [%13, 8] [1, 1] : tensor<?x16xf32> to tensor<?x8xf32>
+ %16 = linalg.fill(%cst, %15) : f32, tensor<?x8xf32> -> tensor<?x8xf32>
+ %17 = tensor.insert_slice %16 into %arg10[%arg7, %arg9] [%13, 8] [1, 1] : tensor<?x8xf32> into tensor<?x16xf32>
+ scf.yield %17 : tensor<?x16xf32>
+ }
+ scf.yield %14 : tensor<?x16xf32>
+ }
+ %9 = scf.for %arg7 = %c0 to %5 step %c8 iter_args(%arg8 = %1) -> (tensor<?x?x8x32xf32>) {
+ %13 = affine.apply affine_map<(d0) -> (d0 ceildiv 8)>(%arg7)
+ %14 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg7, %arg3)
+ %15 = affine.min affine_map<(d0, d1) -> (8, -d0 + d1)>(%arg7, %5)
+ %16 = scf.for %arg9 = %c0 to %c48 step %c32 iter_args(%arg10 = %arg8) -> (tensor<?x?x8x32xf32>) {
+ %17 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg9)
+ %18 = affine.min affine_map<(d0) -> (32, -d0 + 48)>(%arg9)
+ %19 = tensor.extract_slice %arg0[%14, %arg9] [%15, %18] [1, 1] : tensor<24x48xf32> to tensor<?x?xf32>
+ %20 = vector.transfer_read %19[%c0, %c0], %cst : tensor<?x?xf32>, vector<8x32xf32>
+ %21 = vector.transfer_write %20, %arg10[%13, %17, %c0, %c0] {in_bounds = [true, true]} : vector<8x32xf32>, tensor<?x?x8x32xf32>
+ scf.yield %21 : tensor<?x?x8x32xf32>
+ }
+ scf.yield %16 : tensor<?x?x8x32xf32>
+ }
+ %10 = scf.for %arg7 = %c0 to %c16 step %c8 iter_args(%arg8 = %3) -> (tensor<?x?x32x8xf32>) {
+ %13 = affine.apply affine_map<(d0) -> (d0 ceildiv 8)>(%arg7)
+ %14 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg7, %arg5)
+ %15 = scf.for %arg9 = %c0 to %c48 step %c32 iter_args(%arg10 = %arg8) -> (tensor<?x?x32x8xf32>) {
+ %16 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg9)
+ %17 = affine.min affine_map<(d0) -> (32, -d0 + 48)>(%arg9)
+ %18 = tensor.extract_slice %arg1[%arg9, %14] [%17, 8] [1, 1] : tensor<48x32xf32> to tensor<?x8xf32>
+ %19 = vector.transfer_read %18[%c0, %c0], %cst {in_bounds = [false, true]} : tensor<?x8xf32>, vector<32x8xf32>
+ %20 = vector.transfer_write %19, %arg10[%13, %16, %c0, %c0] {in_bounds = [true, true]} : vector<32x8xf32>, tensor<?x?x32x8xf32>
+ scf.yield %20 : tensor<?x?x32x8xf32>
+ }
+ scf.yield %15 : tensor<?x?x32x8xf32>
+ }
+ %11 = scf.for %arg7 = %c0 to %5 step %c8 iter_args(%arg8 = %8) -> (tensor<?x16xf32>) {
+ %13 = affine.min affine_map<(d0, d1) -> (8, -d0 + d1)>(%arg7, %5)
+ %14 = affine.apply affine_map<(d0) -> (d0 ceildiv 8)>(%arg7)
+ %15 = scf.for %arg9 = %c0 to %c16 step %c8 iter_args(%arg10 = %arg8) -> (tensor<?x16xf32>) {
+ %16 = affine.apply affine_map<(d0) -> (d0 ceildiv 8)>(%arg9)
+ %17 = tensor.extract_slice %arg10[%arg7, %arg9] [%13, 8] [1, 1] : tensor<?x16xf32> to tensor<?x8xf32>
+ %18 = vector.transfer_read %17[%c0, %c0], %cst {in_bounds = [false, true]} : tensor<?x8xf32>, vector<8x8xf32>
+ %19 = scf.for %arg11 = %c0 to %c48 step %c32 iter_args(%arg12 = %18) -> (vector<8x8xf32>) {
+ %22 = affine.apply affine_map<(d0) -> (d0 ceildiv 32)>(%arg11)
+ %23 = vector.transfer_read %9[%14, %22, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<?x?x8x32xf32>, vector<8x32xf32>
+ %24 = vector.transfer_read %10[%16, %22, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<?x?x32x8xf32>, vector<32x8xf32>
+ %25 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %23, %24, %arg12 : vector<8x32xf32>, vector<32x8xf32> into vector<8x8xf32>
+ scf.yield %25 : vector<8x8xf32>
+ }
+ %20 = vector.transfer_write %19, %17[%c0, %c0] {in_bounds = [false, true]} : vector<8x8xf32>, tensor<?x8xf32>
+ %21 = tensor.insert_slice %20 into %arg10[%arg7, %arg9] [%13, 8] [1, 1] : tensor<?x8xf32> into tensor<?x16xf32>
+ scf.yield %21 : tensor<?x16xf32>
+ }
+ scf.yield %15 : tensor<?x16xf32>
+ }
+ %12 = tensor.insert_slice %11 into %arg6[%arg3, %arg5] [%5, 16] [1, 1] : tensor<?x16xf32> into tensor<24x32xf32>
+ scf.yield %12 : tensor<24x32xf32>
+ }
+ scf.yield %6 : tensor<24x32xf32>
+ }
+ return %4 : tensor<24x32xf32>
+}
+func private @nano_time() -> i64 attributes {llvm.emit_c_interface}
+func public @main(%arg0: tensor<24x48xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg1: tensor<48x32xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg2: tensor<24x32xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true}, %arg3: memref<?xi64>) -> tensor<24x32xf32> attributes {llvm.emit_c_interface} {
+ %c1 = arith.constant 1 : index
+ %c0 = arith.constant 0 : index
+ %0 = memref.dim %arg3, %c0 : memref<?xi64>
+ %1 = scf.for %arg4 = %c0 to %0 step %c1 iter_args(%arg5 = %arg2) -> (tensor<24x32xf32>) {
+ %2 = call @nano_time() : () -> i64
+ %3 = call @matmul(%arg0, %arg1, %arg5) : (tensor<24x48xf32>, tensor<48x32xf32>, tensor<24x32xf32>) -> tensor<24x32xf32>
+ %4 = call @nano_time() : () -> i64
+ %5 = arith.subi %4, %2 : i64
+ memref.store %5, %arg3[%arg4] : memref<?xi64>
+ scf.yield %3 : tensor<24x32xf32>
+ }
+ return %1 : tensor<24x32xf32>
+}
diff --git a/iree/compiler/Codegen/Sandbox/test/unroll_one_vector_op.mlir b/iree/compiler/Codegen/Sandbox/test/unroll_one_vector_op.mlir
new file mode 100644
index 0000000..43825d1
--- /dev/null
+++ b/iree/compiler/Codegen/Sandbox/test/unroll_one_vector_op.mlir
@@ -0,0 +1,19 @@
+// RUN: iree-opt %s -pass-pipeline='builtin.func(unroll-one-vector-op{anchor-func=test anchor-op=vector.contract source-shape=4,4,3 target-shape=2,4,3})' | FileCheck %s
+
+#matmul_accesses = [
+ affine_map<(i, j, k) -> (i, k)>,
+ affine_map<(i, j, k) -> (k, j)>,
+ affine_map<(i, j, k) -> (i, j)>
+]
+#matmul_trait = {
+ indexing_maps = #matmul_accesses,
+ iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+// CHECK-LABEL: func @test
+func @test(%a: vector<4x3xf32>, %b: vector<3x4xf32>, %c: vector<4x4xf32>) -> vector<4x4xf32> {
+ // CHECK: vector.contract {{.*}} : vector<2x3xf32>, vector<3x4xf32> into vector<2x4xf32>
+ // CHECK: vector.contract {{.*}} : vector<2x3xf32>, vector<3x4xf32> into vector<2x4xf32>
+ %d = vector.contract #matmul_trait %a, %b, %c: vector<4x3xf32>, vector<3x4xf32> into vector<4x4xf32>
+ return %d: vector<4x4xf32>
+}
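The two CHECKed contracts follow directly from the unrolling arithmetic: each
dimension contributes ceil(source/target) tiles, so unrolling a 4x4x3 contract
to 2x4x3 yields 2 * 1 * 1 = 2 smaller contracts. A sketch of that count
(illustrative helper, not part of the diff):

  // Number of unrolled ops produced for a source shape and a target shape:
  // the product of the per-dimension ceiling divisions.
  int64_t numUnrolledTiles(ArrayRef<int64_t> source, ArrayRef<int64_t> target) {
    assert(source.size() == target.size() && "rank mismatch");
    int64_t count = 1;
    for (auto it : llvm::zip(source, target))
      count *= llvm::divideCeil(std::get<0>(it), std::get<1>(it));
    return count;  // {4, 4, 3} vs {2, 4, 3} -> 2
  }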
diff --git a/iree/compiler/Codegen/Transforms/Transforms.h b/iree/compiler/Codegen/Transforms/Transforms.h
index c3165e2..305ac8e 100644
--- a/iree/compiler/Codegen/Transforms/Transforms.h
+++ b/iree/compiler/Codegen/Transforms/Transforms.h
@@ -15,7 +15,7 @@
#include "llvm/Support/Debug.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/Operation.h"
#include "mlir/Pass/Pass.h"
diff --git a/iree/compiler/Codegen/Utils/BUILD b/iree/compiler/Codegen/Utils/BUILD
index 9a58e65..19bddad 100644
--- a/iree/compiler/Codegen/Utils/BUILD
+++ b/iree/compiler/Codegen/Utils/BUILD
@@ -25,6 +25,7 @@
deps = [
"//iree/compiler/Codegen/Interfaces:ProcessorOpInterfaces",
"//iree/compiler/Dialect/Flow/IR",
+ "//iree/compiler/Dialect/Flow/IR:PartitionableLoopsInterface",
"//iree/compiler/Dialect/HAL/IR",
"//llvm-external-projects/iree-dialects:IREELinalgExtDialect",
"@llvm-project//llvm:Support",
diff --git a/iree/compiler/Codegen/Utils/CMakeLists.txt b/iree/compiler/Codegen/Utils/CMakeLists.txt
index 1211b11..de6f31c 100644
--- a/iree/compiler/Codegen/Utils/CMakeLists.txt
+++ b/iree/compiler/Codegen/Utils/CMakeLists.txt
@@ -28,6 +28,7 @@
MLIRSupport
iree::compiler::Codegen::Interfaces::ProcessorOpInterfaces
iree::compiler::Dialect::Flow::IR
+ iree::compiler::Dialect::Flow::IR::PartitionableLoopsInterface
iree::compiler::Dialect::HAL::IR
PUBLIC
)
diff --git a/iree/compiler/Codegen/Utils/Utils.cpp b/iree/compiler/Codegen/Utils/Utils.cpp
index 3db7144..096ca6f 100644
--- a/iree/compiler/Codegen/Utils/Utils.cpp
+++ b/iree/compiler/Codegen/Utils/Utils.cpp
@@ -51,31 +51,41 @@
return entryPointOps;
}
+/// Returns the LLVM target triple associated with the `hal.executable.variant`
+/// operation, if set.
+static Optional<llvm::Triple> getTargetTriple(
+ IREE::HAL::ExecutableVariantOp variantOp) {
+ IREE::HAL::ExecutableTargetAttr targetAttr = variantOp.target();
+ if (!targetAttr) return llvm::None;
+ auto config = targetAttr.getConfiguration();
+ if (!config) return llvm::None;
+ auto triple = config.getAs<StringAttr>("target_triple");
+ if (!triple) return llvm::None;
+ return llvm::Triple(triple.getValue().str());
+}
+
+bool isX86(IREE::HAL::ExecutableVariantOp variantOp) {
+ Optional<llvm::Triple> triple = getTargetTriple(variantOp);
+ return triple && triple.getValue().isX86();
+}
+
//===----------------------------------------------------------------------===//
// Utility functions to get untiled op shapes
//===----------------------------------------------------------------------===//
-SmallVector<unsigned> getPartitionedLoops(Operation *op) {
- if (auto mmt4dOp = dyn_cast<linalg::Mmt4DOp>(op)) {
- return {0, 1};
+SmallVector<int64_t> getDistributedTileSizes(
+ IREE::Flow::PartitionableLoopsInterface interfaceOp,
+ ArrayRef<int64_t> workloadPerWorkgroup) {
+ SmallVector<int64_t> tileSizes(interfaceOp.getNumLoops(), 0);
+ SmallVector<unsigned> partitionableLoops =
+ interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
+ assert(partitionableLoops.size() == workloadPerWorkgroup.size() &&
+ "mismatch in parallelization");
+ for (auto it :
+ llvm::zip(workloadPerWorkgroup, llvm::reverse(partitionableLoops))) {
+ tileSizes[std::get<1>(it)] = std::get<0>(it);
}
- if (auto linalgOp = dyn_cast<linalg::LinalgOp>(op)) {
- SmallVector<unsigned> partitionedLoops;
- for (auto indexedIterator : llvm::enumerate(linalgOp.iterator_types())) {
- if (isParallelIterator(indexedIterator.value())) {
- partitionedLoops.push_back(indexedIterator.index());
- }
- }
- // Only keep the last kNumMaxParallelDims if we have more than that.
- while (partitionedLoops.size() > kNumMaxParallelDims) {
- partitionedLoops.erase(partitionedLoops.begin());
- }
- return partitionedLoops;
- }
- if (auto tilableOp = dyn_cast<IREE::LinalgExt::TiledOpInterface>(op)) {
- return tilableOp.getPartitionableLoops(kNumMaxParallelDims);
- }
- return {};
+ return tileSizes;
}
/// Walk up the defs of the view, to get the untiled value. Either walks up
@@ -131,7 +141,9 @@
// should be tiled and the materialized loop nest. The materialized loops'
// upper bounds should be the original dimension size for the corresponding
// tiled op shape dimension.
- auto partitionedLoops = getPartitionedLoops(linalgOp);
+ auto interfaceOp = cast<IREE::Flow::PartitionableLoopsInterface>(*linalgOp);
+ auto partitionedLoops =
+ interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
SmallVector<LoopTilingAndDistributionInfo> loopInfo =
getTiledAndDistributedLoopInfo(linalgOp->getParentOfType<FuncOp>());
// The number of linalg implicit loops to partition and tiled loops
@@ -583,5 +595,30 @@
return info;
}
+/// Create a linalg::GenericOp version of an n-D copy that can be further
+/// tiled, lowered to loops, or vectorized, unlike the current implementation
+/// of memref::CopyOp.
+Operation *createLinalgCopyOp(OpBuilder &b, Location loc, Value from,
+ Value to) {
+ auto memrefTypeFrom = from.getType().cast<MemRefType>();
+ auto memrefTypeTo = to.getType().cast<MemRefType>();
+ (void)memrefTypeFrom;
+ assert(memrefTypeFrom && memrefTypeTo &&
+ memrefTypeFrom.getRank() == memrefTypeTo.getRank());
+ AffineMap id =
+ AffineMap::getMultiDimIdentityMap(memrefTypeTo.getRank(), b.getContext());
+ SmallVector<StringRef> iteratorTypes(memrefTypeTo.getRank(),
+ getParallelIteratorTypeName());
+ return b.create<linalg::GenericOp>(
+ loc,
+ /*inputs=*/from,
+ /*outputs=*/to,
+ /*indexingMaps=*/llvm::makeArrayRef({id, id}),
+ /*iteratorTypes=*/iteratorTypes,
+ [](OpBuilder &b, Location loc, ValueRange args) {
+ b.create<linalg::YieldOp>(loc, args.front());
+ });
+}
+
} // namespace iree_compiler
} // namespace mlir
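To make the zip-with-reverse in getDistributedTileSizes concrete: the workload
per workgroup is supplied fastest-varying dimension first, while partitionable
loops are listed outermost first, hence the reversal. A worked sketch under
those assumptions:

  // 3-loop op, partitionable loops {0, 1}, workload per workgroup {64, 32}.
  SmallVector<int64_t> tileSizes(3, 0);
  SmallVector<unsigned> partitionableLoops = {0, 1};
  SmallVector<int64_t> workloadPerWorkgroup = {64, 32};
  for (auto it :
       llvm::zip(workloadPerWorkgroup, llvm::reverse(partitionableLoops)))
    tileSizes[std::get<1>(it)] = std::get<0>(it);
  // tileSizes == {32, 64, 0}: the innermost partitionable loop (1) receives
  // the first workload entry (64), loop 0 receives 32, loop 2 stays untiled.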
diff --git a/iree/compiler/Codegen/Utils/Utils.h b/iree/compiler/Codegen/Utils/Utils.h
index fd62177..8623c87 100644
--- a/iree/compiler/Codegen/Utils/Utils.h
+++ b/iree/compiler/Codegen/Utils/Utils.h
@@ -7,8 +7,10 @@
#ifndef IREE_COMPILER_CODEGEN_UTILS_UTILS_H_
#define IREE_COMPILER_CODEGEN_UTILS_UTILS_H_
+#include "iree/compiler/Dialect/Flow/IR/PartitionableLoopsInterface.h"
#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Triple.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/IR/BuiltinOps.h"
@@ -32,6 +34,13 @@
/// Returns the entry point op for the `funcOp`. Returns `nullptr` on failure.
IREE::HAL::ExecutableEntryPointOp getEntryPoint(FuncOp funcOp);
+/// Methods to get backend information.
+bool isX86(IREE::HAL::ExecutableVariantOp variantOp);
+inline bool isX86(FuncOp entryPointFn) {
+ auto variantOp =
+ entryPointFn->getParentOfType<IREE::HAL::ExecutableVariantOp>();
+ return isX86(variantOp);
+}
inline bool isVMVXBackend(IREE::HAL::ExecutableVariantOp variantOp) {
return variantOp.target().getBackend().getValue() == "vmvx";
}
@@ -59,11 +68,13 @@
// Utility functions to set configurations
//===----------------------------------------------------------------------===//
-/// Returns the loops that are partitioned during dispatch region formations, in
-/// order, i.e. starting from the outer-most to innermost.
-/// Note that this is the same method that is used at the Flow dispatch region
-/// formation to tile and distribute the ops.
-SmallVector<unsigned> getPartitionedLoops(Operation *op);
+/// Return the tile sizes to use for the Flow-partitioned loops, derived from
+/// the workload per workgroup. All other loops get a tile size of zero.
+SmallVector<int64_t> getDistributedTileSizes(
+ IREE::Flow::PartitionableLoopsInterface interfaceOp,
+ ArrayRef<int64_t> workloadPerWorkgroup);
/// Information about a tiled and distributed loop.
///
@@ -146,6 +157,7 @@
SmallVector<LoopTilingAndDistributionInfo> getTiledAndDistributedLoopInfo(
FuncOp funcOp);
+Operation *createLinalgCopyOp(OpBuilder &b, Location loc, Value from, Value to);
} // namespace iree_compiler
} // namespace mlir
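The isX86 helpers are intended as lightweight backend queries. A hedged usage
sketch for gating a target-specific choice inside an entry-point function (the
tile sizes are purely illustrative):

  // Pick different tile sizes when the variant targets an x86 triple.
  SmallVector<int64_t> chooseTileSizes(FuncOp entryPointFn) {
    if (isX86(entryPointFn))
      return {8, 32, 16};  // illustrative x86-friendly tiling
    return {8, 8, 8};      // illustrative default
  }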
diff --git a/iree/compiler/Dialect/Flow/IR/FlowBase.td b/iree/compiler/Dialect/Flow/IR/FlowBase.td
index 1974663..1f97e58 100644
--- a/iree/compiler/Dialect/Flow/IR/FlowBase.td
+++ b/iree/compiler/Dialect/Flow/IR/FlowBase.td
@@ -67,7 +67,7 @@
// Base flow dialect op classes
//===----------------------------------------------------------------------===//
-class FLOW_Op<string mnemonic, list<OpTrait> traits = []> :
+class FLOW_Op<string mnemonic, list<Trait> traits = []> :
Op<Flow_Dialect, mnemonic, traits> {
let parser = [{ return parse$cppClass(parser, &result); }];
let printer = [{ return print$cppClass(p, *this); }];
diff --git a/iree/compiler/Dialect/Flow/IR/FlowOps.td b/iree/compiler/Dialect/Flow/IR/FlowOps.td
index 6282f68..ea9ba2e 100644
--- a/iree/compiler/Dialect/Flow/IR/FlowOps.td
+++ b/iree/compiler/Dialect/Flow/IR/FlowOps.td
@@ -17,7 +17,7 @@
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/Interfaces/ViewLikeInterface.td"
-class FLOW_PureOp<string mnemonic, list<OpTrait> traits = []> :
+class FLOW_PureOp<string mnemonic, list<Trait> traits = []> :
FLOW_Op<mnemonic, !listconcat(traits, [NoSideEffect])>;
//===----------------------------------------------------------------------===//
diff --git a/iree/compiler/Dialect/Flow/IR/PartitionableLoopsInterface.cpp b/iree/compiler/Dialect/Flow/IR/PartitionableLoopsInterface.cpp
index 2b63ea2..4941a02 100644
--- a/iree/compiler/Dialect/Flow/IR/PartitionableLoopsInterface.cpp
+++ b/iree/compiler/Dialect/Flow/IR/PartitionableLoopsInterface.cpp
@@ -78,11 +78,7 @@
llvm::SmallVector<unsigned> getPartitionableLoops(
Operation *op, unsigned maxNumPartitionedLoops) const {
- llvm::SmallVector<unsigned> partitionableLoops = {0, 1};
- if (partitionableLoops.size() > maxNumPartitionedLoops) {
- partitionableLoops.resize(maxNumPartitionedLoops);
- }
- return partitionableLoops;
+ return {0, 1};
}
};
@@ -161,7 +157,6 @@
::mlir::linalg::Conv2DOp,
::mlir::linalg::Conv3DNdhwcDhwcfOp,
::mlir::linalg::Conv3DOp,
- ::mlir::linalg::CopyOp,
::mlir::linalg::DepthwiseConv1DNwcWcOp,
::mlir::linalg::DepthwiseConv2DNhwcHwcOp,
::mlir::linalg::DepthwiseConv2DNhwcHwcQOp,
@@ -192,9 +187,9 @@
// clang-format on
registerInterfaceForTiledOpInterfaceOps<
- LinalgExt::FftOp, LinalgExt::ReverseOp, LinalgExt::ScatterOp,
- LinalgExt::SortOp, tensor::ExtractSliceOp, tensor::InsertSliceOp>(
- registry);
+ LinalgExt::FftOp, LinalgExt::ReverseOp, LinalgExt::ScanOp,
+ LinalgExt::ScatterOp, LinalgExt::SortOp, tensor::ExtractSliceOp,
+ tensor::InsertSliceOp>(registry);
}
} // namespace Flow
diff --git a/iree/compiler/Dialect/Flow/Transforms/ConvertLinalgMatmulToMmt4D.cpp b/iree/compiler/Dialect/Flow/Transforms/ConvertLinalgMatmulToMmt4D.cpp
index a10e778..08719c5 100644
--- a/iree/compiler/Dialect/Flow/Transforms/ConvertLinalgMatmulToMmt4D.cpp
+++ b/iree/compiler/Dialect/Flow/Transforms/ConvertLinalgMatmulToMmt4D.cpp
@@ -8,6 +8,8 @@
#include "iree/compiler/Dialect/Flow/Transforms/PassDetail.h"
#include "iree/compiler/Dialect/Flow/Transforms/Passes.h"
+#include "iree/compiler/Utils/CustomKernelsTargetInfo.h"
+#include "llvm/ADT/Optional.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Tensor/Utils/Utils.h"
#include "mlir/IR/PatternMatch.h"
@@ -215,18 +217,34 @@
y.getType().cast<ShapedType>().getDimSize(i);
}
+class Mmt4DTileParams {
+ public:
+ Mmt4DTileParams(int64_t M0, int64_t K0, int64_t N0, std::string comment)
+ : M0(M0), K0(K0), N0(N0), comment(comment) {}
+ std::array<int64_t, 2> lhs() const { return {M0, K0}; }
+ std::array<int64_t, 2> rhs() const { return {K0, N0}; }
+ std::array<int64_t, 2> acc() const { return {M0, N0}; }
+ const std::string &getComment() const { return comment; }
+
+ private:
+ const int64_t M0;
+ const int64_t K0;
+ const int64_t N0;
+ const std::string comment;
+};
+
// Converts linalg.matmul to an equivalent subgraph using linalg.mmt4d.
// Currently, M0, N0, K0 are compile-time constants.
// TODO(ataei): Move this pattern to linalg transforms upstream.
class LinalgMatmulOpToLinalgMmt4DOpPattern
: public OpRewritePattern<linalg::MatmulOp> {
public:
- LinalgMatmulOpToLinalgMmt4DOpPattern(MLIRContext *context, int M0, int K0,
- int N0, PatternBenefit benefit = 1)
- : OpRewritePattern<linalg::MatmulOp>(context, benefit),
- M0(M0),
- K0(K0),
- N0(N0) {}
+ LinalgMatmulOpToLinalgMmt4DOpPattern(
+ MLIRContext *context, const CustomKernelsTargetInfo &target_info,
+ bool enable_generic_slow)
+ : OpRewritePattern<linalg::MatmulOp>(context),
+ target_info(target_info),
+ enable_generic_slow(enable_generic_slow) {}
LogicalResult matchAndRewrite(linalg::MatmulOp matmulOp,
PatternRewriter &rewriter) const override {
@@ -249,23 +267,33 @@
return failure();
}
- Value paddedLhs = pad(loc, rewriter, lhs, {M0, K0});
- Value paddedRhs = pad(loc, rewriter, rhs, {K0, N0});
- Value paddedAcc = pad(loc, rewriter, acc, {M0, N0});
+ const auto &maybe_tile_params = chooseTileParams(lhs, rhs, acc);
+ if (!maybe_tile_params) {
+ // No good tiling is known for the given problem shape, and the slow
+ // generic fallback (for tests) is not enabled.
+ return failure();
+ }
+ const Mmt4DTileParams &tile_params = maybe_tile_params.getValue();
- Value lhs4D = expandTo4D(loc, rewriter, paddedLhs, {M0, K0});
- Value rhs4D = expandTo4D(loc, rewriter, paddedRhs, {K0, N0});
- Value acc4D = expandTo4D(loc, rewriter, paddedAcc, {M0, N0});
+ Value paddedLhs = pad(loc, rewriter, lhs, tile_params.lhs());
+ Value paddedRhs = pad(loc, rewriter, rhs, tile_params.rhs());
+ Value paddedAcc = pad(loc, rewriter, acc, tile_params.acc());
+
+ Value lhs4D = expandTo4D(loc, rewriter, paddedLhs, tile_params.lhs());
+ Value rhs4D = expandTo4D(loc, rewriter, paddedRhs, tile_params.rhs());
+ Value acc4D = expandTo4D(loc, rewriter, paddedAcc, tile_params.acc());
Value lhs4DT = transpose(loc, rewriter, lhs4D, {0, 2, 1, 3});
Value rhs4DT = transpose(loc, rewriter, rhs4D, {2, 0, 3, 1});
Value acc4DT = transpose(loc, rewriter, acc4D, {0, 2, 1, 3});
- auto mmt4dResult = rewriter.create<linalg::Mmt4DOp>(
+ auto mmt4d = rewriter.create<linalg::Mmt4DOp>(
loc, acc4DT.getType(), ValueRange{lhs4DT, rhs4DT}, ValueRange{acc4DT});
+ mmt4d->setAttr(StringAttr::get(getContext(), "comment"),
+ StringAttr::get(getContext(), tile_params.getComment()));
Value mmt4dResultTransposed =
- transpose(loc, rewriter, mmt4dResult.getResult(0), {0, 2, 1, 3});
+ transpose(loc, rewriter, mmt4d.getResult(0), {0, 2, 1, 3});
Value paddedResult =
collapseTo2D(loc, rewriter, mmt4dResultTransposed,
@@ -278,11 +306,32 @@
}
private:
- const int M0;
- const int K0;
- const int N0;
+ llvm::Optional<Mmt4DTileParams> chooseTileParams(Value lhs, Value rhs,
+ Value acc) const;
+
+ CustomKernelsTargetInfo target_info;
+ bool enable_generic_slow;
};
+llvm::Optional<Mmt4DTileParams>
+LinalgMatmulOpToLinalgMmt4DOpPattern::chooseTileParams(Value lhs, Value rhs,
+ Value acc) const {
+ Type lhsElemType = lhs.getType().cast<ShapedType>().getElementType();
+ Type rhsElemType = rhs.getType().cast<ShapedType>().getElementType();
+ Type accElemType = acc.getType().cast<ShapedType>().getElementType();
+ if (lhsElemType.isSignlessInteger(8) && rhsElemType.isSignlessInteger(8) &&
+ accElemType.isSignlessInteger(32) &&
+ target_info.has(CustomKernelTargetFeature::Aarch64Dotprod)) {
+ return Mmt4DTileParams(8, 4, 8, "i8*i8->i32, aarch64 +dotprod");
+ }
+ if (enable_generic_slow) {
+ return Mmt4DTileParams(8, 2, 4,
+ "generic tiling parameters, as no known kernel was "
+ "matched for this matmul and target");
+ }
+ return llvm::None;
+}
+
/// Canonicalizes [linalg.init_tensor -> linalg.fill -> linalg.generic] ->
/// [linalg.init_tensor -> linalg.fill] where the linalg.generic only performs
/// a copy, e.g. a transpose.
@@ -336,23 +385,7 @@
LogicalResult initializeOptions(StringRef options) override {
if (failed(Pass::initializeOptions(options))) return failure();
- auto failureWithMessage = [=](const char *msg) {
- llvm::errs() << "illegal options `" << options << "` for pass `"
- << getArgument() << "`: " << msg << "\n";
- return failure();
- };
- if (M0 == mlir::ShapedType::kDynamicSize ||
- N0 == mlir::ShapedType::kDynamicSize ||
- K0 == mlir::ShapedType::kDynamicSize) {
- return failureWithMessage(
- "currently all three values M0,K0,N0 must be "
- "specified as a fixed size value, not 'dynamic', as the heuristic to "
- "choose these values is not yet implemented.");
- }
- if (M0 == 0 || N0 == 0 || K0 == 0) {
- return failureWithMessage("all three values M0,K0,N0 must be nonzero.");
- }
- return success();
+ return ParseCustomKernelsTargetInfo(arch, features, target_info);
}
void runOnOperation() override {
@@ -360,8 +393,8 @@
// Main pattern.
{
RewritePatternSet patterns(&getContext());
- patterns.insert<LinalgMatmulOpToLinalgMmt4DOpPattern>(context, M0, K0,
- N0);
+ patterns.insert<LinalgMatmulOpToLinalgMmt4DOpPattern>(
+ context, target_info, enable_generic_slow);
if (failed(applyPatternsAndFoldGreedily(getOperation(),
std::move(patterns)))) {
return signalPassFailure();
@@ -380,6 +413,9 @@
}
}
}
+
+ private:
+ CustomKernelsTargetInfo target_info;
};
} // namespace
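The rewrite pads each operand to a multiple of its tile, expands it to 4-D,
and transposes it into the mmt4d layout. With the test-only generic tile
(M0, K0, N0) = (8, 2, 4), a 24x8 LHS expands to 3x8x4x2 and transposes
({0, 2, 1, 3}) to 3x4x8x2, while an 8x32 RHS expands to 4x2x8x4 and transposes
({2, 0, 3, 1}) to 8x4x4x2, matching the FileCheck expectations in
matmul_to_mmt4d.mlir below. A sketch of the expansion arithmetic (illustrative
helper, not part of the diff):

  // 4-D shape produced by expandTo4D for a d0 x d1 operand and tile (t0, t1);
  // operands are padded beforehand, so the divisions are exact.
  std::array<int64_t, 4> expandTo4DShape(int64_t d0, int64_t d1, int64_t t0,
                                         int64_t t1) {
    assert(d0 % t0 == 0 && d1 % t1 == 0 && "expected padded operands");
    return {d0 / t0, t0, d1 / t1, t1};  // (24, 8) with (8, 2) -> 3x8x4x2
  }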
diff --git a/iree/compiler/Dialect/Flow/Transforms/DispatchLinalgOnTensors.cpp b/iree/compiler/Dialect/Flow/Transforms/DispatchLinalgOnTensors.cpp
index 3c857b7..943acf5 100644
--- a/iree/compiler/Dialect/Flow/Transforms/DispatchLinalgOnTensors.cpp
+++ b/iree/compiler/Dialect/Flow/Transforms/DispatchLinalgOnTensors.cpp
@@ -9,6 +9,7 @@
#include "iree/compiler/Dialect/Flow/IR/FlowDialect.h"
#include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
#include "iree/compiler/Dialect/Flow/IR/FlowTypes.h"
+#include "iree/compiler/Dialect/Flow/IR/PartitionableLoopsInterface.h"
#include "iree/compiler/Dialect/Flow/Transforms/DestructiveUpdateUtils.h"
#include "iree/compiler/Dialect/Flow/Transforms/PassDetail.h"
#include "iree/compiler/Dialect/Flow/Transforms/Passes.h"
@@ -693,31 +694,6 @@
return success();
}
-/// Returns the loops that are partitioned during dispatch region formations, in
-/// order, i.e. starting from the outer-most to innermost.
-static SmallVector<unsigned> getPartitionedLoops(Operation *op) {
- if (auto mmt4dOp = dyn_cast<linalg::Mmt4DOp>(op)) {
- return {0, 1};
- }
- if (auto linalgOp = dyn_cast<linalg::LinalgOp>(op)) {
- SmallVector<unsigned> partitionedLoops;
- for (auto indexedIterator : llvm::enumerate(linalgOp.iterator_types())) {
- if (isParallelIterator(indexedIterator.value())) {
- partitionedLoops.push_back(indexedIterator.index());
- }
- }
- // Only keep the last kNumMaxParallelDims if we have more than that.
- while (partitionedLoops.size() > kNumMaxParallelDims) {
- partitionedLoops.erase(partitionedLoops.begin());
- }
- return partitionedLoops;
- }
- if (auto tilableOp = dyn_cast<IREE::LinalgExt::TiledOpInterface>(op)) {
- return tilableOp.getPartitionableLoops(kNumMaxParallelDims);
- }
- return {};
-}
-
static bool hasOnlyDimUses(Operation *op) {
return llvm::all_of(op->getUsers(), [&](Operation *user) {
return isa<tensor::DimOp>(user);
@@ -768,7 +744,9 @@
// of the outermost parallel loops that can be distributed.
Location loc = linalgOp->getLoc();
SmallVector<Range> loopRanges = linalgOp.createLoopRanges(rewriter, loc);
- SmallVector<unsigned> partitionedLoops = getPartitionedLoops(linalgOp);
+ SmallVector<unsigned> partitionedLoops =
+ cast<PartitionableLoopsInterface>(linalgOp.getOperation())
+ .getPartitionableLoops(kNumMaxParallelDims);
SmallVector<Value> count;
for (auto dim : partitionedLoops) {
count.push_back(loopRanges[dim].size);
@@ -847,7 +825,9 @@
SmallVector<StringRef> iteratorTypes = tilableOp.getLoopIteratorTypes();
SmallVector<Range> loopRanges = tilableOp.getIterationDomain(rewriter);
- SmallVector<unsigned> partitionedLoops = getPartitionedLoops(tilableOp);
+ SmallVector<unsigned> partitionedLoops =
+ cast<PartitionableLoopsInterface>(tilableOp.getOperation())
+ .getPartitionableLoops(kNumMaxParallelDims);
SmallVector<Value> count;
for (auto dim : partitionedLoops) {
count.push_back(loopRanges[dim].size);
@@ -1039,7 +1019,11 @@
// workgroup size specified by the backend.
auto tileSizeFn = [&](OpBuilder &builder,
Operation *op) -> SmallVector<Value, 4> {
- SmallVector<unsigned> partitionedLoops = getPartitionedLoops(op);
+ PartitionableLoopsInterface interfaceOp =
+ dyn_cast<PartitionableLoopsInterface>(op);
+ if (!interfaceOp) return {};
+ SmallVector<unsigned> partitionedLoops =
+ interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
if (partitionedLoops.empty()) return {};
unsigned maxDepth = partitionedLoops.back() + 1;
diff --git a/iree/compiler/Dialect/Flow/Transforms/Passes.h b/iree/compiler/Dialect/Flow/Transforms/Passes.h
index 4fcb2dd..6f6502e 100644
--- a/iree/compiler/Dialect/Flow/Transforms/Passes.h
+++ b/iree/compiler/Dialect/Flow/Transforms/Passes.h
@@ -76,8 +76,8 @@
// subtensor_insert. This allows lowering the operation into a single kernel.
std::unique_ptr<Pass> createPadTensorToSubTensorInsertPass();
-// Pass to convert a linalg.matmul into linalg.mmt4d given M0, N0 and K0 are
-// compile time constants.
+// Pass to convert a linalg.matmul into linalg.mmt4d given target ISA
+// information, currently passed as pass options.
std::unique_ptr<OperationPass<FuncOp>> createConvertLinalgMatmulToMmt4DPass();
// Creates a pass to fuse Linalg operations on tensors.
diff --git a/iree/compiler/Dialect/Flow/Transforms/Passes.td b/iree/compiler/Dialect/Flow/Transforms/Passes.td
index 2e8dc08..7fd855f 100644
--- a/iree/compiler/Dialect/Flow/Transforms/Passes.td
+++ b/iree/compiler/Dialect/Flow/Transforms/Passes.td
@@ -109,12 +109,15 @@
let summary = "Convert linalg.matmul to linalg.mmt4d";
let constructor = "mlir::iree_compiler::IREE::Flow::createConvertLinalgMatmulToMmt4DPass()";
let options = [
- Option<"M0", "M0", "int", /*default=*/"mlir::ShapedType::kDynamicSize",
- "Specifies an explicit M-axis tile size, overriding the default heuristic.">,
- Option<"K0", "K0", "int", /*default=*/"mlir::ShapedType::kDynamicSize",
- "Specifies an explicit K-axis tile size, overriding the default heuristic.">,
- Option<"N0", "N0", "int", /*default=*/"mlir::ShapedType::kDynamicSize",
- "Specifies an explicit N-axis tile size, overriding the default heuristic.">,
+ Option<"arch", "arch", "std::string",
+ /*default=*/"",
+ "Target architecture, e.g. aarch64">,
+ Option<"features", "features", "std::string",
+ /*default=*/"",
+ "Additional CPU feature flags, e.g. +dotprod">,
+ Option<"enable_generic_slow", "enable_generic_slow", "bool",
+ /*default=*/"false",
+ "For tests only. Use mmt4d even for cases that are not expected to compile to efficient code by using some arbitrary generic tile shape.">,
];
}
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir b/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir
index 54e5410..8f503e3 100644
--- a/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir
+++ b/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir
@@ -1056,18 +1056,18 @@
// Check that we are distributing along the last three dimensions for the NHWC-output pooling op.
-func @pooling_nwhc_sum_static(%input: tensor<1x33x33x160xf32>) -> tensor<1x1x1x160xf32> {
+func @pooling_nwhc_sum_static(%input: tensor<1x33x33x160xf32>) -> tensor<1x3x3x160xf32> {
%cst = arith.constant 0.0 : f32
- %1 = linalg.init_tensor [1, 1, 1, 160] : tensor<1x1x1x160xf32>
- %2 = linalg.fill(%cst, %1) : f32, tensor<1x1x1x160xf32> -> tensor<1x1x1x160xf32>
- %3 = linalg.init_tensor [33, 33] : tensor<33x33xf32>
- %4 = linalg.pooling_nhwc_sum {dilations = dense<1> : vector<2xi64>, strides = dense<33> : vector<2xi64>} ins(%input, %3 : tensor<1x33x33x160xf32>, tensor<33x33xf32>) outs(%2 : tensor<1x1x1x160xf32>) -> tensor<1x1x1x160xf32>
- return %4 : tensor<1x1x1x160xf32>
+ %1 = linalg.init_tensor [1, 3, 3, 160] : tensor<1x3x3x160xf32>
+ %2 = linalg.fill(%cst, %1) : f32, tensor<1x3x3x160xf32> -> tensor<1x3x3x160xf32>
+ %3 = linalg.init_tensor [11, 11] : tensor<11x11xf32>
+ %4 = linalg.pooling_nhwc_sum {dilations = dense<1> : vector<2xi64>, strides = dense<11> : vector<2xi64>} ins(%input, %3 : tensor<1x33x33x160xf32>, tensor<11x11xf32>) outs(%2 : tensor<1x3x3x160xf32>) -> tensor<1x3x3x160xf32>
+ return %4 : tensor<1x3x3x160xf32>
}
// CHECK-LABEL: func @pooling_nwhc_sum_static
// CHECK: flow.dispatch.workgroups
-// CHECK-NEXT: (%{{.+}}: !flow.dispatch.tensor<readonly:1x33x33x160xf32>, %[[OUTPUT:.+]]: !flow.dispatch.tensor<writeonly:1x1x1x160xf32>)
+// CHECK-NEXT: (%{{.+}}: !flow.dispatch.tensor<readonly:1x33x33x160xf32>, %[[OUTPUT:.+]]: !flow.dispatch.tensor<writeonly:1x3x3x160xf32>)
// CHECK: scf.for %[[Z:.+]] =
// CHECK: scf.for %[[Y:.+]] =
// CHECK: scf.for %[[X:.+]] =
@@ -1197,3 +1197,121 @@
// CHECK: flow.dispatch.workgroups
// CHECK-SAME: (%[[ARG0]])
// CHECK: %[[CST:.+]] = arith.constant dense<[21, 42]> : tensor<2xi32>
+
+// -----
+
+func @gemm_unitN(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x1xf32>,
+ %arg2 : tensor<?x1xf32>) -> tensor<?x1xf32> {
+ %0 = linalg.matmul
+ ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x1xf32>)
+ outs(%arg2 : tensor<?x1xf32>) -> tensor<?x1xf32>
+ return %0 : tensor<?x1xf32>
+}
+// CHECK: func @gemm_unitN(
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>,
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x1xf32>,
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: tensor<?x1xf32>)
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[M:.+]] = tensor.dim %[[ARG0:.+]], %[[C0]]
+// CHECK: flow.dispatch.workgroups[%[[M]], %[[C1]], %[[C1]]]
+// CHECK: scf.for
+// CHECK-NOT: scf.for
+// CHECK: linalg.matmul
+
+// -----
+
+func @gemm_unitM_unitN(%arg0 : tensor<1x1xf32>, %arg1 : tensor<1x1xf32>,
+ %arg2 : tensor<1x1xf32>) -> tensor<1x1xf32> {
+ %0 = linalg.matmul
+ ins(%arg0, %arg1 : tensor<1x1xf32>, tensor<1x1xf32>)
+ outs(%arg2 : tensor<1x1xf32>) -> tensor<1x1xf32>
+ return %0 : tensor<1x1xf32>
+}
+// CHECK: func @gemm_unitM_unitN(
+// CHECK: %[[C1:.+]] = arith.constant 1 : index
+// CHECK: flow.dispatch.workgroups[%[[C1]], %[[C1]], %[[C1]]]
+// CHECK-NOT: scf.for
+// CHECK: linalg.matmul
+
+// -----
+
+func @gemm_unitM(%arg0 : tensor<1x?xf32>, %arg1 : tensor<?x?xf32>,
+ %arg2 : tensor<1x?xf32>) -> tensor<1x?xf32> {
+ %0 = linalg.matmul
+ ins(%arg0, %arg1 : tensor<1x?xf32>, tensor<?x?xf32>)
+ outs(%arg2 : tensor<1x?xf32>) -> tensor<1x?xf32>
+ return %0 : tensor<1x?xf32>
+}
+// CHECK: func @gemm_unitM(
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<1x?xf32>,
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>,
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: tensor<1x?xf32>)
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[N:.+]] = tensor.dim %[[ARG1:.+]], %[[C1]]
+// CHECK: flow.dispatch.workgroups[%[[N]], %[[C1]], %[[C1]]]
+// CHECK: scf.for
+// CHECK-NOT: scf.for
+// CHECK: linalg.matmul
+
+// -----
+
+#map = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>
+func @unit_dim_generic(%arg0 : tensor<1x?x1x1x?x?x1x?xf32>,
+ %arg1 : tensor<1x?x1x1x?x?x1x?xf32>) -> tensor<1x?x1x1x?x?x1x?xf32> {
+ %0 = linalg.generic {
+ indexing_maps = [#map, #map, #map],
+ iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]}
+ ins(%arg0, %arg1 : tensor<1x?x1x1x?x?x1x?xf32>, tensor<1x?x1x1x?x?x1x?xf32>)
+ outs(%arg0 : tensor<1x?x1x1x?x?x1x?xf32>) {
+ ^bb0(%arg2 : f32, %arg3 : f32, %arg4 : f32):
+ %1 = arith.addf %arg2, %arg3 : f32
+ linalg.yield %1 : f32
+ } -> tensor<1x?x1x1x?x?x1x?xf32>
+ return %0 : tensor<1x?x1x1x?x?x1x?xf32>
+}
+// CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0, s1] -> (s0 * s1)>
+// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1)[s0] -> (d0, -d1 + s0)>
+// CHECK: func @unit_dim_generic(
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<1x?x1x1x?x?x1x?xf32>
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<1x?x1x1x?x?x1x?xf32>)
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
+// CHECK-DAG: %[[C5:.+]] = arith.constant 5 : index
+// CHECK-DAG: %[[C7:.+]] = arith.constant 7 : index
+// CHECK-DAG: %[[D1:.+]] = tensor.dim %[[ARG0]], %[[C1]]
+// CHECK-DAG: %[[D4:.+]] = tensor.dim %[[ARG0]], %[[C4]]
+// CHECK-DAG: %[[D5:.+]] = tensor.dim %[[ARG0]], %[[C5]]
+// CHECK-DAG: %[[D7:.+]] = tensor.dim %[[ARG0]], %[[C7]]
+// CHECK: flow.dispatch.workgroups[%[[D7]], %[[D5]], %[[D4]]]
+// CHECK-SAME: (%[[ARG0]], %[[D1]], %[[D4]], %[[D5]], %[[D7]]
+// CHECK-NEXT: %[[ARG2:[a-zA-Z0-9]+]]: !flow.dispatch.tensor<readwrite:1x?x1x1x?x?x1x?xf32>
+// CHECK-SAME: %[[ARG3:[a-zA-Z0-9]+]]: index
+// CHECK-SAME: %[[ARG4:[a-zA-Z0-9]+]]: index
+// CHECK-SAME: %[[ARG5:[a-zA-Z0-9]+]]: index
+// CHECK-SAME: %[[ARG6:[a-zA-Z0-9]+]]: index
+// CHECK-DAG: %[[WG_SIZE_X:.+]] = flow.dispatch.workgroup.size[0] : index
+// CHECK-DAG: %[[WG_SIZE_Y:.+]] = flow.dispatch.workgroup.size[1] : index
+// CHECK-DAG: %[[WG_SIZE_Z:.+]] = flow.dispatch.workgroup.size[2] : index
+// CHECK-DAG: %[[WG_ID_X:.+]] = flow.dispatch.workgroup.id[0] : index
+// CHECK-DAG: %[[WG_COUNT_X:.+]] = flow.dispatch.workgroup.count[0] : index
+// CHECK-DAG: %[[WG_ID_Y:.+]] = flow.dispatch.workgroup.id[1] : index
+// CHECK-DAG: %[[WG_COUNT_Y:.+]] = flow.dispatch.workgroup.count[1] : index
+// CHECK-DAG: %[[WG_ID_Z:.+]] = flow.dispatch.workgroup.id[2] : index
+// CHECK-DAG: %[[WG_COUNT_Z:.+]] = flow.dispatch.workgroup.count[2] : index
+// CHECK-DAG: %[[LB_Z:.+]] = affine.apply #[[MAP0]]()[%[[WG_ID_Z]], %[[WG_SIZE_Z]]]
+// CHECK-DAG: %[[STEP_Z:.+]] = affine.apply #[[MAP0]]()[%[[WG_COUNT_Z]], %[[WG_SIZE_Z]]]
+// CHECK: scf.for %[[IV0:.+]] = %[[LB_Z]] to %[[ARG4]] step %[[STEP_Z]]
+// CHECK-DAG: %[[LB_Y:.+]] = affine.apply #[[MAP0]]()[%[[WG_ID_Y]], %[[WG_SIZE_Y]]]
+// CHECK-DAG: %[[STEP_Y:.+]] = affine.apply #[[MAP0]]()[%[[WG_COUNT_Y]], %[[WG_SIZE_Y]]]
+// CHECK: scf.for %[[IV1:.+]] = %[[LB_Y]] to %[[ARG5]] step %[[STEP_Y]]
+// CHECK-DAG: %[[LB_X:.+]] = affine.apply #[[MAP0]]()[%[[WG_ID_X]], %[[WG_SIZE_X]]]
+// CHECK-DAG: %[[STEP_X:.+]] = affine.apply #[[MAP0]]()[%[[WG_COUNT_X]], %[[WG_SIZE_X]]]
+// CHECK: scf.for %[[IV2:.+]] = %[[LB_X]] to %[[ARG6]] step %[[STEP_X]]
+// CHECK-DAG: %[[TILE_Z:.+]] = affine.min #[[MAP1]](%[[WG_SIZE_Z]], %[[IV0]])[%[[ARG4]]]
+// CHECK-DAG: %[[TILE_Y:.+]] = affine.min #[[MAP1]](%[[WG_SIZE_Y]], %[[IV1]])[%[[ARG5]]]
+// CHECK-DAG: %[[TILE_X:.+]] = affine.min #[[MAP1]](%[[WG_SIZE_X]], %[[IV2]])[%[[ARG6]]]
+// CHECK: flow.dispatch.tensor.load %[[ARG2]]
+// CHECK-SAME: offsets = [0, 0, 0, 0, %[[IV0]], %[[IV1]], 0, %[[IV2]]]
+// CHECK-SAME: sizes = [1, %[[ARG3]], 1, 1, %[[TILE_Z]], %[[TILE_Y]], 1, %[[TILE_X]]]
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/matmul_to_mmt4d.mlir b/iree/compiler/Dialect/Flow/Transforms/test/matmul_to_mmt4d.mlir
index cf1d826..22138ee 100644
--- a/iree/compiler/Dialect/Flow/Transforms/test/matmul_to_mmt4d.mlir
+++ b/iree/compiler/Dialect/Flow/Transforms/test/matmul_to_mmt4d.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt -split-input-file --iree-flow-convert-linalg-matmul-to-mmt4d='M0=8 K0=2 N0=4' %s | FileCheck %s
+// RUN: iree-opt -split-input-file --iree-flow-convert-linalg-matmul-to-mmt4d=enable_generic_slow %s | FileCheck %s
func @check_mmt4d_f32_static_nopad(%arg0: tensor<24x8xf32>, %arg1: tensor<8x32xf32>, %arg2: tensor<24x32xf32>) -> tensor<24x32xf32> {
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<24x8xf32>, tensor<8x32xf32>) outs(%arg2 : tensor<24x32xf32>) -> tensor<24x32xf32>
@@ -38,7 +38,9 @@
// CHECK-NEXT: ^bb0(%{{.*}}: f32, %{{.*}}: f32):
// CHECK-NEXT: linalg.yield %arg3 : f32
// CHECK-NEXT: } -> tensor<3x8x8x4xf32>
-// CHECK: %[[MMT4D:.+]] = linalg.mmt4d ins(%[[LHS4DT]], %[[RHS4DT]] : tensor<3x4x8x2xf32>, tensor<8x4x4x2xf32>) outs(%[[DST4DT]] : tensor<3x8x8x4xf32>) -> tensor<3x8x8x4xf32>
+// CHECK: %[[MMT4D:.+]] = linalg.mmt4d
+// CHECK-SAME: {comment = "generic tiling parameters, as no known kernel was matched for this matmul and target"}
+// CHECK-SAME: ins(%[[LHS4DT]], %[[RHS4DT]] : tensor<3x4x8x2xf32>, tensor<8x4x4x2xf32>) outs(%[[DST4DT]] : tensor<3x8x8x4xf32>) -> tensor<3x8x8x4xf32>
// CHECK: %[[MMT4DT_INIT:.+]] = linalg.init_tensor [3, 8, 8, 4] : tensor<3x8x8x4xf32>
// CHECK: %[[MMT4DT:.+]] = linalg.generic
// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]]]
diff --git a/iree/compiler/Dialect/HAL/IR/HALBase.td b/iree/compiler/Dialect/HAL/IR/HALBase.td
index e5568b0..e11d682 100644
--- a/iree/compiler/Dialect/HAL/IR/HALBase.td
+++ b/iree/compiler/Dialect/HAL/IR/HALBase.td
@@ -792,7 +792,7 @@
// Base HAL op classes
//===----------------------------------------------------------------------===//
-class HAL_Op<string mnemonic, list<OpTrait> traits = []> :
+class HAL_Op<string mnemonic, list<Trait> traits = []> :
Op<HAL_Dialect, mnemonic, traits> {
let parser = [{ return parse$cppClass(parser, &result); }];
let printer = [{ return print$cppClass(p, *this); }];
diff --git a/iree/compiler/Dialect/HAL/IR/HALDialect.cpp b/iree/compiler/Dialect/HAL/IR/HALDialect.cpp
index 8768c4b..7b01c9d 100644
--- a/iree/compiler/Dialect/HAL/IR/HALDialect.cpp
+++ b/iree/compiler/Dialect/HAL/IR/HALDialect.cpp
@@ -77,7 +77,7 @@
public:
using VMConversionDialectInterface::VMConversionDialectInterface;
- OwningModuleRef parseVMImportModule() const override {
+ OwningOpRef<mlir::ModuleOp> parseVMImportModule() const override {
return mlir::parseSourceString(StringRef(iree_hal_imports_create()->data,
iree_hal_imports_create()->size),
getDialect()->getContext());
diff --git a/iree/compiler/Dialect/HAL/IR/HALOps.td b/iree/compiler/Dialect/HAL/IR/HALOps.td
index aaf061c..25bd39f 100644
--- a/iree/compiler/Dialect/HAL/IR/HALOps.td
+++ b/iree/compiler/Dialect/HAL/IR/HALOps.td
@@ -16,10 +16,10 @@
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/Interfaces/ViewLikeInterface.td"
-class HAL_PureOp<string mnemonic, list<OpTrait> traits = []> :
+class HAL_PureOp<string mnemonic, list<Trait> traits = []> :
HAL_Op<mnemonic, !listconcat(traits, [NoSideEffect])>;
-class HAL_MakeTupleOp<string mnemonic, list<OpTrait> traits = []> :
+class HAL_MakeTupleOp<string mnemonic, list<Trait> traits = []> :
HAL_PureOp<mnemonic, traits>;
//===----------------------------------------------------------------------===//
diff --git a/iree/compiler/Dialect/HAL/Target/CUDA/CMakeLists.txt b/iree/compiler/Dialect/HAL/Target/CUDA/CMakeLists.txt
index 6131c1a..ba20b84 100644
--- a/iree/compiler/Dialect/HAL/Target/CUDA/CMakeLists.txt
+++ b/iree/compiler/Dialect/HAL/Target/CUDA/CMakeLists.txt
@@ -6,42 +6,60 @@
# Doesn't use bazel_to_cmake because of various special logic throughout.
-if(NOT "${IREE_TARGET_BACKEND_CUDA}")
+if(NOT IREE_TARGET_BACKEND_CUDA)
return()
endif()
+if(NOT IREE_CUDA_LIBDEVICE_PATH)
+ message(FATAL_ERROR "No IREE_CUDA_LIBDEVICE_PATH found (should have been set at top level)")
+endif()
+
iree_add_all_subdirs()
-# iree_cc_library(
-# NAME
-# CUDA
-# HDRS
-# "CUDATarget.h"
-# "LLVMPasses.h"
-# SRCS
-# "CUDATarget.cpp"
-# "NoLoopUnrollPass.cpp"
-# DEPS
-# LLVMAnalysis
-# LLVMBitReader
-# LLVMCore
-# LLVMipo
-# LLVMLinker
-# LLVMNVPTXCodeGen
-# LLVMSupport
-# LLVMTarget
-# MLIRGPUOps
-# MLIRLLVMIR
-# MLIRLLVMToLLVMIRTranslation
-# MLIRNVVMIR
-# MLIRNVVMToLLVMIRTranslation
-# MLIRPass
-# MLIRSupport
-# MLIRTargetLLVMIRExport
-# iree::base::internal::flatcc::building
-# iree::compiler::Codegen::LLVMGPU
-# iree::compiler::Dialect::HAL::Target
-# iree::compiler::Utils
-# iree::schemas::cuda_executable_def_c_fbs
-# PUBLIC
-# )
+iree_c_embed_data(
+ NAME
+ cuda_libdevice
+ SRCS
+ "${IREE_CUDA_LIBDEVICE_PATH}"
+ C_FILE_OUTPUT
+ "libdevice.c"
+ H_FILE_OUTPUT
+ "libdevice.h"
+ FLATTEN
+ PUBLIC
+)
+
+iree_cc_library(
+ NAME
+ CUDA
+ HDRS
+ "CUDATarget.h"
+ "LLVMPasses.h"
+ SRCS
+ "CUDATarget.cpp"
+ "NoLoopUnrollPass.cpp"
+ DEPS
+ ::cuda_libdevice
+ LLVMAnalysis
+ LLVMBitReader
+ LLVMCore
+ LLVMipo
+ LLVMLinker
+ LLVMNVPTXCodeGen
+ LLVMSupport
+ LLVMTarget
+ MLIRGPUOps
+ MLIRLLVMIR
+ MLIRLLVMToLLVMIRTranslation
+ MLIRNVVMIR
+ MLIRNVVMToLLVMIRTranslation
+ MLIRPass
+ MLIRSupport
+ MLIRTargetLLVMIRExport
+ iree::base::internal::flatcc::building
+ iree::compiler::Codegen::LLVMGPU
+ iree::compiler::Dialect::HAL::Target
+ iree::compiler::Utils
+ iree::schemas::cuda_executable_def_c_fbs
+ PUBLIC
+)
diff --git a/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp b/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
index 41d10ad..2541cbc 100644
--- a/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
+++ b/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
@@ -9,6 +9,7 @@
#include "iree/compiler/Codegen/Dialect/IREECodegenDialect.h"
#include "iree/compiler/Codegen/Passes.h"
#include "iree/compiler/Dialect/HAL/Target/CUDA/LLVMPasses.h"
+#include "iree/compiler/Dialect/HAL/Target/CUDA/libdevice.h"
#include "iree/compiler/Dialect/HAL/Target/TargetRegistry.h"
#include "iree/compiler/Utils/FlatbufferUtils.h"
#include "iree/schemas/cuda_executable_def_builder.h"
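For orientation, the libdevice bitcode embedded by the CMake rule above is surfaced through a generated C accessor that CUDATarget.cpp can parse into an LLVM module for linking. A hedged sketch of that flow follows — the accessor name `cuda_libdevice_create` matches the iree_c_embed_data convention (compare `iree_hal_imports_create` used elsewhere in the tree) but is assumed here, as is `llvmContext`; the actual call site is outside this hunk:

  // Sketch (assumptions noted above): parse the embedded libdevice bitcode
  // into an llvm::Module so it can be linked into the NVPTX module produced
  // by this backend (LLVMBitReader is already in DEPS).
  const struct iree_file_toc_t *toc = cuda_libdevice_create();
  llvm::MemoryBufferRef bitcodeRef(
      llvm::StringRef(toc->data, toc->size), "libdevice.bc");
  llvm::Expected<std::unique_ptr<llvm::Module>> libdevice =
      llvm::parseBitcodeFile(bitcodeRef, llvmContext);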
diff --git a/iree/compiler/Dialect/HAL/Target/CUDA/test/BUILD b/iree/compiler/Dialect/HAL/Target/CUDA/test/BUILD
deleted file mode 100644
index ac1f0e0..0000000
--- a/iree/compiler/Dialect/HAL/Target/CUDA/test/BUILD
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright 2021 The IREE Authors
-#
-# Licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-# load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
-# load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
-
-package(
- default_visibility = ["//visibility:public"],
- features = ["layering_check"],
- licenses = ["notice"], # Apache 2.0
-)
-
-# iree_lit_test_suite(
-# name = "lit",
-# srcs = enforce_glob(
-# ["smoketest.mlir"],
-# include = ["*.mlir"],
-# ),
-# tools = [
-# "//iree/tools:iree-opt",
-# "@llvm-project//llvm:FileCheck",
-# ],
-# )
diff --git a/iree/compiler/Dialect/HAL/Target/CUDA/test/CMakeLists.txt b/iree/compiler/Dialect/HAL/Target/CUDA/test/CMakeLists.txt
index 0b6559b..31c1f25 100644
--- a/iree/compiler/Dialect/HAL/Target/CUDA/test/CMakeLists.txt
+++ b/iree/compiler/Dialect/HAL/Target/CUDA/test/CMakeLists.txt
@@ -1,13 +1,11 @@
-################################################################################
-# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from #
-# iree/compiler/Dialect/HAL/Target/CUDA/test/BUILD #
-# #
-# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary #
-# CMake-only content. #
-# #
-# To disable autogeneration for this file entirely, delete this header. #
-################################################################################
-
iree_add_all_subdirs()
-### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
+iree_lit_test_suite(
+ NAME
+ lit
+ SRCS
+ "smoketest.mlir"
+ TOOLS
+ FileCheck
+ iree::tools::iree-opt
+)
diff --git a/iree/compiler/Dialect/HAL/Target/MetalSPIRV/MetalSPIRVTarget.cpp b/iree/compiler/Dialect/HAL/Target/MetalSPIRV/MetalSPIRVTarget.cpp
index 6919875..ebcb6e3 100644
--- a/iree/compiler/Dialect/HAL/Target/MetalSPIRV/MetalSPIRVTarget.cpp
+++ b/iree/compiler/Dialect/HAL/Target/MetalSPIRV/MetalSPIRVTarget.cpp
@@ -18,7 +18,7 @@
#include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h"
#include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Target/SPIRV/Serialization.h"
namespace mlir {
diff --git a/iree/compiler/Dialect/HAL/Utils/BUILD b/iree/compiler/Dialect/HAL/Utils/BUILD
index 9ea8bb6..a0febfd 100644
--- a/iree/compiler/Dialect/HAL/Utils/BUILD
+++ b/iree/compiler/Dialect/HAL/Utils/BUILD
@@ -12,11 +12,16 @@
cc_library(
name = "Utils",
+ srcs = [
+ "InferCustomKernelsTargetInfoFromParent.cpp",
+ ],
hdrs = [
"DeviceSwitchBuilder.h",
+ "InferCustomKernelsTargetInfoFromParent.h",
],
deps = [
"//iree/compiler/Dialect/HAL/IR",
+ "//iree/compiler/Utils",
"@llvm-project//llvm:Support",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:StandardOps",
diff --git a/iree/compiler/Dialect/HAL/Utils/CMakeLists.txt b/iree/compiler/Dialect/HAL/Utils/CMakeLists.txt
index ddb2b88..643b46b 100644
--- a/iree/compiler/Dialect/HAL/Utils/CMakeLists.txt
+++ b/iree/compiler/Dialect/HAL/Utils/CMakeLists.txt
@@ -15,6 +15,9 @@
Utils
HDRS
"DeviceSwitchBuilder.h"
+ "InferCustomKernelsTargetInfoFromParent.h"
+ SRCS
+ "InferCustomKernelsTargetInfoFromParent.cpp"
DEPS
LLVMSupport
MLIRIR
@@ -22,6 +25,7 @@
MLIRSupport
MLIRTransforms
iree::compiler::Dialect::HAL::IR
+ iree::compiler::Utils
PUBLIC
)
diff --git a/iree/compiler/Dialect/HAL/Utils/InferCustomKernelsTargetInfoFromParent.cpp b/iree/compiler/Dialect/HAL/Utils/InferCustomKernelsTargetInfoFromParent.cpp
new file mode 100644
index 0000000..accabdd
--- /dev/null
+++ b/iree/compiler/Dialect/HAL/Utils/InferCustomKernelsTargetInfoFromParent.cpp
@@ -0,0 +1,54 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Dialect/HAL/Utils/InferCustomKernelsTargetInfoFromParent.h"
+
+#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
+#include "iree/compiler/Utils/CustomKernelsTargetInfo.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace mlir {
+namespace iree_compiler {
+
+LogicalResult InferCustomKernelsTargetInfoFromParent(
+ FuncOp entryPointFn, CustomKernelsTargetInfo &target_info) {
+ // Set the out-value to defaults early so that early returns produce
+ // consistent results and so that we can write simpler code below
+ // (for loop OR-ing booleans, assuming initial 'false' value).
+ target_info = CustomKernelsTargetInfo();
+
+ // Try to find the parent ExecutableVariantOp and its relevant attributes.
+ auto variantOp =
+ entryPointFn->getParentOfType<IREE::HAL::ExecutableVariantOp>();
+ if (!variantOp) {
+ return failure();
+ }
+ IREE::HAL::ExecutableTargetAttr targetAttr = variantOp.target();
+ if (!targetAttr) {
+ return failure();
+ }
+ auto config = targetAttr.getConfiguration();
+ if (!config) {
+ return failure();
+ }
+ auto tripleAttr = config.getAs<StringAttr>("target_triple");
+ if (!tripleAttr) {
+ return failure();
+ }
+ auto cpuFeaturesAttr = config.getAs<StringAttr>("cpu_features");
+ if (!cpuFeaturesAttr) {
+ return failure();
+ }
+
+ // Exactly the implementation of llvm::Triple::getArchName, skipping all the
+  // parsing work of constructing an llvm::Triple from a string.
+ llvm::StringRef archName(tripleAttr.getValue().split('-').first);
+ llvm::StringRef featuresStr(cpuFeaturesAttr.getValue());
+ return ParseCustomKernelsTargetInfo(archName, featuresStr, target_info);
+}
+
+} // namespace iree_compiler
+} // namespace mlir
diff --git a/iree/compiler/Dialect/HAL/Utils/InferCustomKernelsTargetInfoFromParent.h b/iree/compiler/Dialect/HAL/Utils/InferCustomKernelsTargetInfoFromParent.h
new file mode 100644
index 0000000..e7fe181
--- /dev/null
+++ b/iree/compiler/Dialect/HAL/Utils/InferCustomKernelsTargetInfoFromParent.h
@@ -0,0 +1,27 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_COMPILER_DIALECT_HAL_UTILS_INFERCUSTOMKERNELSTARGETINFOFROMPARENT_H_
+#define IREE_COMPILER_DIALECT_HAL_UTILS_INFERCUSTOMKERNELSTARGETINFOFROMPARENT_H_
+
+#include <stdint.h>
+
+#include <cassert>
+
+#include "iree/compiler/Utils/CustomKernelsTargetInfo.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Support/LogicalResult.h"
+
+namespace mlir {
+namespace iree_compiler {
+
+LogicalResult InferCustomKernelsTargetInfoFromParent(
+ FuncOp entryPointFn, CustomKernelsTargetInfo &target_info);
+
+} // namespace iree_compiler
+} // namespace mlir
+
+#endif // IREE_COMPILER_DIALECT_HAL_UTILS_INFERCUSTOMKERNELSTARGETINFOFROMPARENT_H_
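A minimal usage sketch for the helper declared above (illustrative only; `entryPointFn` and the surrounding pass body are assumed and are not part of this change):

  CustomKernelsTargetInfo targetInfo;
  if (succeeded(InferCustomKernelsTargetInfoFromParent(entryPointFn,
                                                       targetInfo)) &&
      targetInfo.has(CustomKernelTargetFeature::Aarch64Dotprod)) {
    // This variant targets aarch64 with +dotprod, so arch-specific custom
    // kernels can be selected safely.
  }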
diff --git a/iree/compiler/Dialect/Modules/Check/IR/CheckDialect.cpp b/iree/compiler/Dialect/Modules/Check/IR/CheckDialect.cpp
index 4d0da79..645e000 100644
--- a/iree/compiler/Dialect/Modules/Check/IR/CheckDialect.cpp
+++ b/iree/compiler/Dialect/Modules/Check/IR/CheckDialect.cpp
@@ -25,7 +25,7 @@
public:
using VMConversionDialectInterface::VMConversionDialectInterface;
- OwningModuleRef parseVMImportModule() const override {
+ OwningOpRef<mlir::ModuleOp> parseVMImportModule() const override {
return mlir::parseSourceString(StringRef(iree_check_imports_create()->data,
iree_check_imports_create()->size),
getDialect()->getContext());
diff --git a/iree/compiler/Dialect/Modules/VMVX/IR/VMVXBase.td b/iree/compiler/Dialect/Modules/VMVX/IR/VMVXBase.td
index 74465d7..c1b67e9 100644
--- a/iree/compiler/Dialect/Modules/VMVX/IR/VMVXBase.td
+++ b/iree/compiler/Dialect/Modules/VMVX/IR/VMVXBase.td
@@ -79,7 +79,7 @@
];
}
-class VMVX_Op<string mnemonic, list<OpTrait> traits = []> :
+class VMVX_Op<string mnemonic, list<Trait> traits = []> :
Op<VMVX_Dialect, mnemonic, !listconcat(traits, [])> {}
#endif // IREE_DIALECT_MODULES_VMVX_BASE
diff --git a/iree/compiler/Dialect/Modules/VMVX/IR/VMVXDialect.cpp b/iree/compiler/Dialect/Modules/VMVX/IR/VMVXDialect.cpp
index 511a280..0c54851 100644
--- a/iree/compiler/Dialect/Modules/VMVX/IR/VMVXDialect.cpp
+++ b/iree/compiler/Dialect/Modules/VMVX/IR/VMVXDialect.cpp
@@ -27,7 +27,7 @@
public:
using VMConversionDialectInterface::VMConversionDialectInterface;
- OwningModuleRef parseVMImportModule() const override {
+ OwningOpRef<mlir::ModuleOp> parseVMImportModule() const override {
return mlir::parseSourceString(StringRef(iree_vmvx_imports_create()->data,
iree_vmvx_imports_create()->size),
getDialect()->getContext());
diff --git a/iree/compiler/Dialect/Modules/VMVX/IR/VMVXOps.td b/iree/compiler/Dialect/Modules/VMVX/IR/VMVXOps.td
index 8b01d9e..d5aadaf 100644
--- a/iree/compiler/Dialect/Modules/VMVX/IR/VMVXOps.td
+++ b/iree/compiler/Dialect/Modules/VMVX/IR/VMVXOps.td
@@ -11,7 +11,7 @@
include "mlir/IR/OpAsmInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
-class VMVX_PureOp<string mnemonic, list<OpTrait> traits = []> :
+class VMVX_PureOp<string mnemonic, list<Trait> traits = []> :
VMVX_Op<mnemonic, !listconcat(traits, [NoSideEffect])>;
//===----------------------------------------------------------------------===//
diff --git a/iree/compiler/Dialect/Modules/VMVX/Transforms/Passes.cpp b/iree/compiler/Dialect/Modules/VMVX/Transforms/Passes.cpp
index c4ab9bc..ab76fae 100644
--- a/iree/compiler/Dialect/Modules/VMVX/Transforms/Passes.cpp
+++ b/iree/compiler/Dialect/Modules/VMVX/Transforms/Passes.cpp
@@ -48,6 +48,7 @@
// Linalg -> SCF.
nestedModulePM.addNestedPass<FuncOp>(
IREE::LinalgExt::createLinalgExtToLoopsPass());
+ nestedModulePM.addNestedPass<FuncOp>(createMemrefCopyToLinalgPass());
nestedModulePM.addNestedPass<FuncOp>(createConvertLinalgToLoopsPass());
nestedModulePM.addNestedPass<FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<FuncOp>(createCSEPass());
@@ -57,7 +58,7 @@
nestedModulePM.addNestedPass<FuncOp>(memref::createExpandOpsPass());
// Handle tensor-type constants.
- nestedModulePM.addPass(createTensorConstantBufferizePass());
+ nestedModulePM.addPass(arith::createConstantBufferizePass());
nestedModulePM.addPass(createFoldTensorExtractOpPass());
// Flatten and cleanup memrefs.
diff --git a/iree/compiler/Dialect/Stream/IR/StreamBase.td b/iree/compiler/Dialect/Stream/IR/StreamBase.td
index e7574a7..bc62c2f 100644
--- a/iree/compiler/Dialect/Stream/IR/StreamBase.td
+++ b/iree/compiler/Dialect/Stream/IR/StreamBase.td
@@ -107,7 +107,7 @@
// Base stream dialect op classes
//===----------------------------------------------------------------------===//
-class Stream_Op<string mnemonic, list<OpTrait> traits = []> :
+class Stream_Op<string mnemonic, list<Trait> traits = []> :
Op<Stream_Dialect, mnemonic, traits> {
let parser = [{ return parse$cppClass(parser, &result); }];
let printer = [{ return print$cppClass(p, *this); }];
diff --git a/iree/compiler/Dialect/Stream/IR/StreamOps.td b/iree/compiler/Dialect/Stream/IR/StreamOps.td
index b647a6e..213e757 100644
--- a/iree/compiler/Dialect/Stream/IR/StreamOps.td
+++ b/iree/compiler/Dialect/Stream/IR/StreamOps.td
@@ -17,7 +17,7 @@
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/Interfaces/ViewLikeInterface.td"
-class Stream_PureOp<string mnemonic, list<OpTrait> traits = []> :
+class Stream_PureOp<string mnemonic, list<Trait> traits = []> :
Stream_Op<mnemonic, !listconcat(traits, [NoSideEffect])>;
//===----------------------------------------------------------------------===//
diff --git a/iree/compiler/Dialect/Stream/Transforms/MaterializeBuiltins.cpp b/iree/compiler/Dialect/Stream/Transforms/MaterializeBuiltins.cpp
index 3f21edb..4f4a3bc 100644
--- a/iree/compiler/Dialect/Stream/Transforms/MaterializeBuiltins.cpp
+++ b/iree/compiler/Dialect/Stream/Transforms/MaterializeBuiltins.cpp
@@ -19,7 +19,7 @@
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypes.h"
diff --git a/iree/compiler/Dialect/Util/IR/UtilOps.td b/iree/compiler/Dialect/Util/IR/UtilOps.td
index 2cfe2c6..a5d5ea2 100644
--- a/iree/compiler/Dialect/Util/IR/UtilOps.td
+++ b/iree/compiler/Dialect/Util/IR/UtilOps.td
@@ -21,13 +21,13 @@
// Op types
//===----------------------------------------------------------------------===//
-class Util_Op<string mnemonic, list<OpTrait> traits = []> :
+class Util_Op<string mnemonic, list<Trait> traits = []> :
Op<Util_Dialect, mnemonic, traits> {
let parser = [{ return parse$cppClass(parser, result); }];
let printer = [{ print$cppClass(p, *this); }];
}
-class Util_PureOp<string mnemonic, list<OpTrait> traits = []> :
+class Util_PureOp<string mnemonic, list<Trait> traits = []> :
Util_Op<mnemonic, !listconcat(traits, [NoSideEffect])>;
//===----------------------------------------------------------------------===//
diff --git a/iree/compiler/Dialect/Util/Tools/StructAttrGen.cpp b/iree/compiler/Dialect/Util/Tools/StructAttrGen.cpp
index 1e5be8b..d1be382 100644
--- a/iree/compiler/Dialect/Util/Tools/StructAttrGen.cpp
+++ b/iree/compiler/Dialect/Util/Tools/StructAttrGen.cpp
@@ -4,6 +4,7 @@
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "iree/compiler/Utils/StringUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
@@ -334,17 +335,6 @@
os << "}\n\n";
}
-// Replaces all occurrences of `match` in `str` with `substitute`.
-static std::string replaceAllSubstrs(std::string str, const std::string &match,
- const std::string &substitute) {
- std::string::size_type scanLoc = 0, matchLoc = std::string::npos;
- while ((matchLoc = str.find(match, scanLoc)) != std::string::npos) {
- str = str.replace(matchLoc, match.size(), substitute);
- scanLoc = matchLoc + substitute.size();
- }
- return str;
-}
-
static void emitTypedFactoryDef(const StructAttr &structAttr, raw_ostream &os) {
os << "// static\n";
os << formatv("{0} {0}::get(", structAttr.getStructClassName());
@@ -366,7 +356,8 @@
// wrapping quotes.
std::string builderTemplate = type.getConstBuilderTemplate().str();
if (StringRef(builderTemplate).contains("\"$0\"")) {
- builderTemplate = replaceAllSubstrs(builderTemplate, "\"$0\"", "$0");
+ builderTemplate = mlir::iree_compiler::replaceAllSubstrs(builderTemplate,
+ "\"$0\"", "$0");
}
os << formatv(" auto {0}Attr = {1};\n", field.getName(),
diff --git a/iree/compiler/Dialect/VM/Conversion/ConversionDialectInterface.h b/iree/compiler/Dialect/VM/Conversion/ConversionDialectInterface.h
index 465b199..d6e030f 100644
--- a/iree/compiler/Dialect/VM/Conversion/ConversionDialectInterface.h
+++ b/iree/compiler/Dialect/VM/Conversion/ConversionDialectInterface.h
@@ -51,11 +51,11 @@
protected:
// Parses the vm.import module to be cached by the caller.
- virtual OwningModuleRef parseVMImportModule() const = 0;
+ virtual OwningOpRef<mlir::ModuleOp> parseVMImportModule() const = 0;
private:
mutable std::once_flag importParseFlag;
- mutable OwningModuleRef importModuleRef;
+ mutable OwningOpRef<mlir::ModuleOp> importModuleRef;
};
} // namespace iree_compiler
diff --git a/iree/compiler/Dialect/VM/IR/VMBase.td b/iree/compiler/Dialect/VM/IR/VMBase.td
index 466fdf2..d32d730 100644
--- a/iree/compiler/Dialect/VM/IR/VMBase.td
+++ b/iree/compiler/Dialect/VM/IR/VMBase.td
@@ -187,7 +187,7 @@
];
}
-class VM_Op<string mnemonic, list<OpTrait> traits = []> :
+class VM_Op<string mnemonic, list<Trait> traits = []> :
Op<VM_Dialect, mnemonic,
!listconcat(traits, [VM_OpInterface])> {
let parser = [{ return parse$cppClass(parser, &result); }];
diff --git a/iree/compiler/Dialect/VM/IR/VMOps.td b/iree/compiler/Dialect/VM/IR/VMOps.td
index 5507629..3bceb38 100644
--- a/iree/compiler/Dialect/VM/IR/VMOps.td
+++ b/iree/compiler/Dialect/VM/IR/VMOps.td
@@ -15,7 +15,7 @@
include "mlir/IR/SymbolInterfaces.td"
include "iree/compiler/Dialect/VM/IR/VMBase.td"
-class VM_PureOp<string mnemonic, list<OpTrait> traits = []> :
+class VM_PureOp<string mnemonic, list<Trait> traits = []> :
VM_Op<mnemonic, !listconcat(traits, [NoSideEffect])>;
//===----------------------------------------------------------------------===//
@@ -295,7 +295,7 @@
// TODO(b/142336293): DCE globals when unused and not exports. Note that side
// effects may need to be observed.
-class VM_GlobalOp<string mnemonic, Attr attr_type, list<OpTrait> traits = []> :
+class VM_GlobalOp<string mnemonic, Attr attr_type, list<Trait> traits = []> :
VM_Op<mnemonic, !listconcat(traits, [
IsolatedFromAbove,
HasParent<"IREE::VM::ModuleOp">,
@@ -443,7 +443,7 @@
let verifier = [{ return verifyGlobalAddressOp(*this); }];
}
-class VM_GlobalLoadOp<Type type, string mnemonic, list<OpTrait> traits = []> :
+class VM_GlobalLoadOp<Type type, string mnemonic, list<Trait> traits = []> :
VM_Op<mnemonic, !listconcat(traits, [
DeclareOpInterfaceMethods<VM_SerializableOpInterface>,
// HACK: works around the lack of symbol side effects in C++.
@@ -462,7 +462,7 @@
}
class VM_GlobalLoadPrimitiveOp<Type type, string mnemonic, VM_OPC opcode,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_GlobalLoadOp<type, mnemonic, traits> {
let description = [{
    Loads the value of a global containing a primitive value.
@@ -475,7 +475,7 @@
];
}
-class VM_GlobalStoreOp<Type type, string mnemonic, list<OpTrait> traits = []> :
+class VM_GlobalStoreOp<Type type, string mnemonic, list<Trait> traits = []> :
VM_Op<mnemonic, !listconcat(traits, [
DeclareOpInterfaceMethods<VM_SerializableOpInterface>,
])> {
@@ -490,7 +490,7 @@
}
class VM_GlobalStorePrimitiveOp<Type type, string mnemonic, VM_OPC opcode,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_GlobalStoreOp<type, mnemonic, traits> {
let description = [{
    Stores a primitive value to a global.
@@ -504,7 +504,7 @@
}
class VM_GlobalLoadIndirectOp<Type type, string mnemonic,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_Op<mnemonic, !listconcat(traits, [
DeclareOpInterfaceMethods<VM_SerializableOpInterface>,
])> {
@@ -520,7 +520,7 @@
class VM_GlobalLoadIndirectPrimitiveOp<Type type, string mnemonic,
VM_OPC opcode,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_GlobalLoadIndirectOp<type, mnemonic, traits> {
let description = [{
Loads the value of a global containing a primitive value.
@@ -534,7 +534,7 @@
}
class VM_GlobalStoreIndirectOp<Type type, string mnemonic,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_Op<mnemonic, !listconcat(traits, [
DeclareOpInterfaceMethods<VM_SerializableOpInterface>,
])> {
@@ -548,7 +548,7 @@
class VM_GlobalStoreIndirectPrimitiveOp<Type type, string mnemonic,
VM_OPC opcode,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_GlobalStoreIndirectOp<type, mnemonic, traits> {
let description = [{
Stores a primitive value to a global.
@@ -751,7 +751,7 @@
// Constants
//===----------------------------------------------------------------------===//
-class VM_ConstOp<string mnemonic, string ctype, list<OpTrait> traits = []> :
+class VM_ConstOp<string mnemonic, string ctype, list<Trait> traits = []> :
VM_PureOp<mnemonic, !listconcat(traits, [
ConstantLike,
DeclareOpInterfaceMethods<VM_SerializableOpInterface>,
@@ -774,7 +774,7 @@
class VM_ConstantPrimitiveOp<Type type, int width, string mnemonic,
VM_OPC opcode,
- string ctype, list<OpTrait> traits = []> :
+ string ctype, list<Trait> traits = []> :
VM_ConstOp<mnemonic, ctype, traits> {
let description = [{
Defines a constant value that is treated as a scalar literal at runtime.
@@ -844,7 +844,7 @@
}
class VM_ConstantPrimitiveZeroOp<Type type, string mnemonic, VM_OPC opcode,
- string ctype, list<OpTrait> traits = []> :
+ string ctype, list<Trait> traits = []> :
VM_ConstOp<mnemonic, ctype, traits> {
let description = [{
Defines a constant zero primitive.
@@ -1210,7 +1210,7 @@
}
class VM_BufferFillOp<Type type, string mnemonic, VM_OPC opcode,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_Op<mnemonic, !listconcat(traits, [
DeclareOpInterfaceMethods<VM_SerializableOpInterface>,
MemoryEffects<[MemWrite]>,
@@ -1269,7 +1269,7 @@
}
class VM_BufferLoadOp<Type type, string mnemonic, VM_OPC opcode,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_PureOp<mnemonic, !listconcat(traits, [
DeclareOpInterfaceMethods<VM_SerializableOpInterface>,
MemoryEffects<[MemRead]>,
@@ -1302,7 +1302,7 @@
}
class VM_BufferStoreOp<Type type, string mnemonic, VM_OPC opcode,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_Op<mnemonic, !listconcat(traits, [
DeclareOpInterfaceMethods<VM_SerializableOpInterface>,
MemoryEffects<[MemWrite]>,
@@ -1511,7 +1511,7 @@
}
class VM_ListGetPrimitiveOp<Type type, string mnemonic, VM_OPC opcode,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_PureOp<mnemonic, !listconcat(traits, [
DeclareOpInterfaceMethods<VM_SerializableOpInterface>,
MemoryEffects<[MemRead]>,
@@ -1540,7 +1540,7 @@
}
class VM_ListSetPrimitiveOp<Type type, string mnemonic, VM_OPC opcode,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_Op<mnemonic, !listconcat(traits, [
DeclareOpInterfaceMethods<VM_SerializableOpInterface>,
MemoryEffects<[MemWrite]>,
@@ -1655,7 +1655,7 @@
//===----------------------------------------------------------------------===//
class VM_SelectPrimitiveOp<Type type, string mnemonic, VM_OPC opcode,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_PureOp<mnemonic, !listconcat(traits, [
DeclareOpInterfaceMethods<VM_SerializableOpInterface>,
AllTypesMatch<["true_value", "false_value", "result"]>,
@@ -1751,7 +1751,7 @@
}
class VM_SwitchIntegerOp<I type, string mnemonic, VM_OPC opcode,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_PureOp<mnemonic, !listconcat(traits, [
DeclareOpInterfaceMethods<VM_SerializableOpInterface>,
AllTypesMatch<["default_value", "result"]>,
@@ -1790,7 +1790,7 @@
}
class VM_SwitchFloatOp<F type, string mnemonic, VM_OPC opcode,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_PureOp<mnemonic, !listconcat(traits, [
DeclareOpInterfaceMethods<VM_SerializableOpInterface>,
AllTypesMatch<["default_value", "result"]>,
@@ -1898,7 +1898,7 @@
//===----------------------------------------------------------------------===//
class VM_UnaryArithmeticOp<Type type, string mnemonic, VM_OPC opcode,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_PureOp<mnemonic, !listconcat(traits, [
DeclareOpInterfaceMethods<VM_SerializableOpInterface>,
AllTypesMatch<["operand", "result"]>,
@@ -1920,7 +1920,7 @@
}
class VM_BinaryArithmeticOp<Type type, string mnemonic, VM_OPC opcode,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_PureOp<mnemonic, !listconcat(traits, [
DeclareOpInterfaceMethods<VM_SerializableOpInterface>,
AllTypesMatch<["lhs", "rhs", "result"]>,
@@ -1944,7 +1944,7 @@
}
class VM_TernaryArithmeticOp<Type type, string mnemonic, VM_OPC opcode,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_PureOp<mnemonic, !listconcat(traits, [
DeclareOpInterfaceMethods<VM_SerializableOpInterface>,
AllTypesMatch<["a", "b", "c", "result"]>,
@@ -2404,7 +2404,7 @@
//===----------------------------------------------------------------------===//
class VM_ShiftArithmeticOp<I type, string mnemonic, VM_OPC opcode,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_PureOp<mnemonic, !listconcat(traits, [
DeclareOpInterfaceMethods<VM_SerializableOpInterface>,
AllTypesMatch<["operand", "result"]>,
@@ -2469,7 +2469,7 @@
//===----------------------------------------------------------------------===//
class VM_ConversionOp<Type src_type, Type dst_type, string mnemonic,
- VM_OPC opcode, list<OpTrait> traits = []> :
+ VM_OPC opcode, list<Trait> traits = []> :
VM_PureOp<mnemonic, !listconcat(traits, [
DeclareOpInterfaceMethods<VM_SerializableOpInterface>,
])> {
@@ -2490,7 +2490,7 @@
}
class VM_ConversionPseudoOp<Type src_type, Type dst_type, string mnemonic,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_PureOp<mnemonic, traits> {
let arguments = (ins
src_type:$operand
@@ -2680,7 +2680,7 @@
//===----------------------------------------------------------------------===//
class VM_UnaryComparisonOp<Type type, string mnemonic, VM_OPC opcode,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_PureOp<mnemonic, !listconcat(traits, [
DeclareOpInterfaceMethods<VM_SerializableOpInterface>,
])> {
@@ -2705,7 +2705,7 @@
}
class VM_UnaryComparisonPseudoOp<Type type, string mnemonic,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_PureOp<mnemonic, !listconcat(traits, [
VM_PseudoOp,
])> {
@@ -2724,7 +2724,7 @@
}
class VM_BinaryComparisonOp<Type type, string mnemonic, VM_OPC opcode,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_PureOp<mnemonic, !listconcat(traits, [
DeclareOpInterfaceMethods<VM_SerializableOpInterface>,
AllTypesMatch<["lhs", "rhs"]>,
@@ -2752,7 +2752,7 @@
}
class VM_BinaryComparisonPseudoOp<Type type, string mnemonic,
- list<OpTrait> traits = []> :
+ list<Trait> traits = []> :
VM_PureOp<mnemonic, !listconcat(traits, [
AllTypesMatch<["lhs", "rhs"]>,
VM_PseudoOp,
@@ -3469,7 +3469,7 @@
let hasCanonicalizer = 1;
}
-class VM_CallBaseOp<string mnemonic, list<OpTrait> traits = []> :
+class VM_CallBaseOp<string mnemonic, list<Trait> traits = []> :
VM_Op<mnemonic, !listconcat(traits, [
DeclareOpInterfaceMethods<VM_SerializableOpInterface>,
DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
@@ -3737,7 +3737,7 @@
let hasCanonicalizer = 1;
}
-class VM_CheckOp<string mnemonic, list<OpTrait> traits = []> :
+class VM_CheckOp<string mnemonic, list<Trait> traits = []> :
VM_Op<mnemonic, !listconcat(traits, [
VM_PseudoOp,
])> {
@@ -3759,7 +3759,7 @@
}];
}
-class VM_UnaryCheckOp<string mnemonic, list<OpTrait> traits = []> :
+class VM_UnaryCheckOp<string mnemonic, list<Trait> traits = []> :
VM_CheckOp<mnemonic, traits> {
let arguments = (ins
VM_AnyType:$value,
@@ -3781,7 +3781,7 @@
];
}
-class VM_BinaryCheckOp<string mnemonic, list<OpTrait> traits = []> :
+class VM_BinaryCheckOp<string mnemonic, list<Trait> traits = []> :
VM_CheckOp<mnemonic, !listconcat(traits, [
AllTypesMatch<["lhs", "rhs"]>,
])> {
diff --git a/iree/compiler/InputConversion/MHLO/MHLOToMHLOPreprocessing.cpp b/iree/compiler/InputConversion/MHLO/MHLOToMHLOPreprocessing.cpp
index a7516cf..1477bd1 100644
--- a/iree/compiler/InputConversion/MHLO/MHLOToMHLOPreprocessing.cpp
+++ b/iree/compiler/InputConversion/MHLO/MHLOToMHLOPreprocessing.cpp
@@ -79,7 +79,6 @@
.getResult();
}
-
class ExtractConvOpPaddingAttributes : public OpRewritePattern<mhlo::ConvOp> {
public:
using OpRewritePattern<mhlo::ConvOp>::OpRewritePattern;
@@ -595,6 +594,86 @@
}
};
+class ScatterRank0Value : public OpRewritePattern<mhlo::ScatterOp> {
+ public:
+ using OpRewritePattern<mhlo::ScatterOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(mhlo::ScatterOp op,
+ PatternRewriter &rewriter) const override {
+ auto operand = op.operand();
+ auto indices = op.scatter_indices();
+ auto updates = op.updates();
+
+ auto operandTy = operand.getType().dyn_cast<RankedTensorType>();
+ auto indicesTy = indices.getType().dyn_cast<RankedTensorType>();
+ auto updatesTy = updates.getType().dyn_cast<RankedTensorType>();
+ if (!operandTy || !indicesTy || !updatesTy) return failure();
+
+ if (indicesTy.getRank() != 1 || !indicesTy.hasStaticShape() ||
+ updatesTy.getRank() != 0) {
+ return failure();
+ }
+
+ auto dimNumbers = op.scatter_dimension_numbers();
+
+    // Indices are rank-1, so the index vector dim must be 0.
+ if (dimNumbers.getIndexVectorDim() != 0) return failure();
+
+    // Require canonicalized scatter order.
+ // TODO(suderman): Transpose to canonicalized order.
+ for (auto en : llvm::enumerate(dimNumbers.getScatterDimsToOperandDims())) {
+ if (en.index() != en.value()) return failure();
+ }
+
+ // Inserted window dims should be in order. Technically we just need to
+ // check they are all contained.
+ for (auto en : llvm::enumerate(dimNumbers.getInsertedWindowDims())) {
+ if (en.index() != en.value()) return failure();
+ }
+
+    // Update window dims must be empty.
+ if (dimNumbers.getUpdateWindowDims().size() != 0) {
+ return failure();
+ }
+
+ // Reshape indices to add the implicit 1x out front.
+    llvm::SmallVector<int64_t> newIndicesShape;
+    newIndicesShape.push_back(1);
+    for (int64_t dim : indicesTy.getShape()) {
+      newIndicesShape.push_back(dim);
+    }
+
+ Location loc = op.getLoc();
+ Value reshapedIndices = rewriter.create<mhlo::ReshapeOp>(
+ loc, RankedTensorType::get(newIndicesShape, indicesTy.getElementType()),
+ indices);
+
+ Value reshapedUpdates = rewriter.create<mhlo::ReshapeOp>(
+ loc, RankedTensorType::get({1}, updatesTy.getElementType()), updates);
+
+ SmallVector<int64_t> insertedWindowDims =
+ llvm::to_vector<4>(llvm::seq<int64_t>(0, operandTy.getRank()));
+ SmallVector<int64_t> scatterDimsToOperandDims =
+ llvm::to_vector<4>(llvm::seq<int64_t>(0, operandTy.getRank()));
+ auto newDimNumbers = mhlo::ScatterDimensionNumbersAttr::get(
+ op.getContext(), {}, insertedWindowDims, scatterDimsToOperandDims,
+ /*indexVectorDim=*/1);
+
+ auto newScatter = rewriter.create<mhlo::ScatterOp>(
+ loc, op.getType(), operand, reshapedIndices, reshapedUpdates,
+ newDimNumbers, op.indices_are_sorted(), op.unique_indices());
+
+    Region &region = newScatter.update_computation();
+ rewriter.cloneRegionBefore(op.update_computation(), region, region.end());
+
+ rewriter.replaceOp(op, newScatter.getResult());
+
+ return success();
+ }
+};
+
// Generates Gaussian noise with uniform random generator based on Box-Muller
// transform.
class ExpandRngNormal : public OpRewritePattern<mhlo::RngNormalOp> {
@@ -780,8 +859,10 @@
mhlo::PopulateUnfuseBatchNormPatterns(context, &patterns);
mhlo::PopulateComplexLoweringPatterns(context, &patterns);
mhlo::PopulateGatherToTorchIndexSelectPatterns(context, &patterns);
- patterns.insert<ExtractReduceWindowOpPaddingAttributes,
- AdjustDepthwiseFilterShape, ExpandRngNormal>(context);
+ patterns
+ .insert<ExtractReduceWindowOpPaddingAttributes,
+ AdjustDepthwiseFilterShape, ScatterRank0Value, ExpandRngNormal>(
+ context);
  // dot_general canonicalization patterns.
mhlo::PopulateGeneralDotOpLoweringPatterns(&patterns, context);
diff --git a/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing.mlir b/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing.mlir
index 55d378c..c57e5c0 100644
--- a/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing.mlir
+++ b/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing.mlir
@@ -322,3 +322,19 @@
// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[CON]][0] [15] [1] : tensor<16xf32> to tensor<15xf32>
// CHECK: %[[RES:.+]] = "mhlo.reshape"(%[[SLICE]]) : (tensor<15xf32>) -> tensor<3x5xf32>
// CHECK: return %[[RES]]
+
+// -----
+
+func @scatter_rank0(%arg0: tensor<5x5xi32>, %arg1: tensor<2xi32>, %arg2: tensor<i32>) -> tensor<5x5xi32> {
+ %0 = "mhlo.scatter"(%arg0, %arg1, %arg2) ({
+ ^bb0(%arg3: tensor<i32>, %arg4: tensor<i32>):
+ "mhlo.return"(%arg4) : (tensor<i32>) -> ()
+ }) {indices_are_sorted = true, scatter_dimension_numbers = #mhlo.scatter<inserted_window_dims = [0], scatter_dims_to_operand_dims = [0]>, unique_indices = true} : (tensor<5x5xi32>, tensor<2xi32>, tensor<i32>) -> tensor<5x5xi32>
+ return %0 : tensor<5x5xi32>
+}
+
+// CHECK-LABEL: func @scatter_rank0
+// CHECK-DAG: %[[RE_I:.+]] = "mhlo.reshape"(%arg1) : (tensor<2xi32>) -> tensor<1x2xi32>
+// CHECK-DAG: %[[RE_U:.+]] = "mhlo.reshape"(%arg2) : (tensor<i32>) -> tensor<1xi32>
+// CHECK: %[[SCATTER:.+]] = "mhlo.scatter"(%arg0, %[[RE_I]], %[[RE_U]])
+// CHECK: "mhlo.return"(%arg4)
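As an aside on the ExpandRngNormal pattern registered above: the Box-Muller transform it is based on maps two independent uniform samples in (0, 1] to a standard normal sample. A scalar sketch for reference (illustrative only; the pattern itself performs the equivalent tensor-level rewrite on mhlo ops):

  #include <cmath>
  // z ~ N(0, 1) when u1, u2 are independent uniform samples in (0, 1].
  float boxMuller(float u1, float u2) {
    const float kTwoPi = 6.28318530718f;
    return std::sqrt(-2.0f * std::log(u1)) * std::cos(kTwoPi * u2);
  }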
diff --git a/iree/compiler/Translation/test/BUILD b/iree/compiler/Translation/test/BUILD
index 77ccb64..165e90f 100644
--- a/iree/compiler/Translation/test/BUILD
+++ b/iree/compiler/Translation/test/BUILD
@@ -21,6 +21,7 @@
[
"hal_executable.mlir",
"smoketest.mlir",
+ "streams.mlir",
],
include = ["*.mlir"],
),
diff --git a/iree/compiler/Translation/test/CMakeLists.txt b/iree/compiler/Translation/test/CMakeLists.txt
index cdc563b..8fa58ce 100644
--- a/iree/compiler/Translation/test/CMakeLists.txt
+++ b/iree/compiler/Translation/test/CMakeLists.txt
@@ -16,6 +16,7 @@
SRCS
"hal_executable.mlir"
"smoketest.mlir"
+ "streams.mlir"
TOOLS
FileCheck
iree::tools::iree-translate
diff --git a/iree/compiler/Translation/test/streams.mlir b/iree/compiler/Translation/test/streams.mlir
new file mode 100644
index 0000000..a7a0a59
--- /dev/null
+++ b/iree/compiler/Translation/test/streams.mlir
@@ -0,0 +1,217 @@
+// RUN: iree-translate -split-input-file -iree-hal-target-backends=vmvx -iree-mlir-to-vm-bytecode-module -iree-vm-bytecode-module-output-format=flatbuffer-text %s -print-ir-after=iree-vm-ordinal-allocation 2>&1 | FileCheck %s
+
+// This file has a few test programs that show how to mix `flow` dispatches into
+// those created by the `linalg` dispatch region formation: the idea is to use
+// any normal IREE input (mhlo/tosa/linalg/etc) on tensors and then also include
+// `flow.dispatch` ops calling `stream.executable`s. `flow.executable`s could be
+// used too but currently have some ergonomics issues that need to be resolved;
+// the improved version of `flow.dispatch` (and `flow.dispatch.workgroups`) will
+// be made part of the public `iree` dialect, at which point this file will
+// switch to using that. The `flow`/`stream` dialects are generally not considered
+// stable.
+
+// A simple element-wise multiply of two static tensors:
+// %ret0 = %arg0 * %arg1
+//
+// The host code performs the dispatch with a workload of 4x1x1 - how many
+// workgroups that gets distributed across is left to the HAL backend to decide
+// based on the target device and how the work is tiled.
+//
+// The device code in the stream.executable is tiled - but does not need to be:
+// the only thing we care about at this level is the bindings and any operands
+// that may need to be passed from host->device.
+
+// CHECK-LABEL: vm.module public @e2e
+module @e2e {
+// CHECK: vm.rodata private @executable_0_vmvx_bytecode_fb
+stream.executable private @executable_0 {
+ stream.executable.export public @dispatch
+ builtin.module {
+ func @dispatch(%arg0: !stream.binding, %arg1: !stream.binding, %ret0: !stream.binding) {
+ %c0 = arith.constant 0 : index
+ %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
+ %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
+ %2 = stream.binding.subspan %ret0[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
+ %c4 = arith.constant 4 : index
+ %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
+ %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
+ %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
+ %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
+ %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
+ scf.for %arg3 = %3 to %c4 step %4 {
+ %5 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 4)>(%arg3)[%workgroup_size_0]
+ %6 = flow.dispatch.tensor.load %0, offsets = [%arg3], sizes = [%5], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<?xf32>
+ %7 = flow.dispatch.tensor.load %1, offsets = [%arg3], sizes = [%5], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<?xf32>
+ %8 = linalg.init_tensor [%5] : tensor<?xf32>
+ %9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<?xf32>, tensor<?xf32>) outs(%8 : tensor<?xf32>) attrs = {name = "mul.1"} {
+ ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
+ %10 = arith.mulf %arg4, %arg5 : f32
+ linalg.yield %10 : f32
+ } -> tensor<?xf32>
+ flow.dispatch.tensor.store %9, %2, offsets = [%arg3], sizes = [%5], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
+ }
+ return
+ }
+ }
+}
+// CHECK: vm.func private @simple_mul
+func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
+ %c1 = arith.constant 1 : index
+ %c4 = arith.constant 4 : index
+ // CHECK: vm.call @hal.command_buffer.dispatch
+ %ret0 = flow.dispatch @executable_0::@dispatch[%c4, %c1, %c1](%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
+ return %ret0 : tensor<4xf32>
+}
+} // module
+
+// -----
+
+// The same element-wise multiply but now in-place:
+// %arg0 = %arg0 * %arg1
+//
+// In-place operations can often introduce false dependencies between dispatches
+// and should be avoided at this level in most cases - there's currently no cost
+// model for making dispatches into in-place operations but it's something that
+// would happen in the stream dialect after scheduling: two dispatches known
+// not to run concurrently and operating on the same resources could be made
+// in-place.
+
+// CHECK-LABEL: vm.module public @inplace
+module @inplace {
+// CHECK: vm.rodata private @executable_1_vmvx_bytecode_fb
+stream.executable private @executable_1 {
+ stream.executable.export public @dispatch
+ builtin.module {
+ func @dispatch(%arg0: !stream.binding, %arg1: !stream.binding) {
+ %c0 = arith.constant 0 : index
+ %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:4xf32>
+ %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
+ %c4 = arith.constant 4 : index
+ %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
+ %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
+ %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
+ %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
+ %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
+ scf.for %arg3 = %3 to %c4 step %4 {
+ %5 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 4)>(%arg3)[%workgroup_size_0]
+ %6 = flow.dispatch.tensor.load %0, offsets = [%arg3], sizes = [%5], strides = [1] : !flow.dispatch.tensor<readwrite:4xf32> -> tensor<?xf32>
+ %7 = flow.dispatch.tensor.load %1, offsets = [%arg3], sizes = [%5], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<?xf32>
+ %8 = linalg.init_tensor [%5] : tensor<?xf32>
+ %9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<?xf32>, tensor<?xf32>) outs(%8 : tensor<?xf32>) attrs = {name = "mul.1"} {
+ ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
+ %10 = arith.mulf %arg4, %arg5 : f32
+ linalg.yield %10 : f32
+ } -> tensor<?xf32>
+ flow.dispatch.tensor.store %9, %0, offsets = [%arg3], sizes = [%5], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<readwrite:4xf32>
+ }
+ return
+ }
+ }
+}
+// CHECK: vm.func private @simple_mul_inplace
+func @simple_mul_inplace(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
+ %c1 = arith.constant 1 : index
+ %c4 = arith.constant 4 : index
+ // CHECK: vm.call @hal.command_buffer.dispatch
+ %ret0 = flow.dispatch @executable_1::@dispatch[%c4, %c1, %c1](%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> %arg0
+ return %ret0 : tensor<4xf32>
+}
+} // module
+
+// -----
+
+// The same element-wise multiply but now with dynamic shapes:
+// %ret0 = %arg0 * %arg1
+//
+// This shows how the shape dimensions are captured by the dispatch so that the
+// host knows the shapes of the tensors and how the dimensions are passed as
+// operands to the executable for association. Once we perform the host/device
+// split the association allows tensor.dim ops in the device code to query the
+// dynamic dimensions without needing to insert new host -> device transfers.
+// Note that because of this explicit association the order of the dispatch
+// operands doesn't matter as walking the SSA use-def chain up to the
+// stream.binding.subspan allows them to be resolved directly.
+
+// CHECK-LABEL: vm.module public @dynamic
+module @dynamic {
+// CHECK: vm.rodata private @executable_2_vmvx_bytecode_fb
+stream.executable private @executable_2 {
+ stream.executable.export public @dispatch
+ builtin.module {
+ func @dispatch(%arg0: !stream.binding, %arg0_dim0: index, %arg1: !stream.binding, %arg1_dim0: index, %ret0: !stream.binding) {
+ %c0 = arith.constant 0 : index
+ %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:?xf32>{%arg0_dim0}
+ %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:?xf32>{%arg1_dim0}
+ %2 = stream.binding.subspan %ret0[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:?xf32>{%arg0_dim0}
+ %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
+ %workgroup_id_0 = flow.dispatch.workgroup.id[0] : index
+ %workgroup_count_0 = flow.dispatch.workgroup.count[0] : index
+ %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_0, %workgroup_size_0]
+ %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_0, %workgroup_size_0]
+ scf.for %arg5 = %3 to %arg0_dim0 step %4 {
+ %5 = affine.min affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>(%arg5)[%arg0_dim0, %workgroup_size_0]
+ %6 = flow.dispatch.tensor.load %0, offsets = [%arg5], sizes = [%5], strides = [1] : !flow.dispatch.tensor<readonly:?xf32>{%arg0_dim0} -> tensor<?xf32>
+ %7 = flow.dispatch.tensor.load %1, offsets = [%arg5], sizes = [%5], strides = [1] : !flow.dispatch.tensor<readonly:?xf32>{%arg1_dim0} -> tensor<?xf32>
+ %8 = linalg.init_tensor [%5] : tensor<?xf32>
+ %9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<?xf32>, tensor<?xf32>) outs(%8 : tensor<?xf32>) attrs = {name = "mul.1"} {
+ ^bb0(%arg6: f32, %arg7: f32, %arg8: f32):
+ %10 = arith.mulf %arg6, %arg7 : f32
+ linalg.yield %10 : f32
+ } -> tensor<?xf32>
+ flow.dispatch.tensor.store %9, %2, offsets = [%arg5], sizes = [%5], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:?xf32>{%arg0_dim0}
+ }
+ return
+ }
+ }
+}
+// CHECK: vm.func private @simple_mul_dynamic
+func @simple_mul_dynamic(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ // CHECK: vm.call @hal.buffer_view.dim
+ %arg0_dim0 = tensor.dim %arg0, %c0 : tensor<?xf32>
+ // CHECK: vm.call @hal.buffer_view.dim
+ %arg1_dim0 = tensor.dim %arg1, %c0 : tensor<?xf32>
+ // CHECK: vm.call @hal.command_buffer.dispatch
+ %ret0 = flow.dispatch @executable_2::@dispatch[%arg0_dim0, %c1, %c1](%arg0, %arg0_dim0, %arg1, %arg1_dim0) : (tensor<?xf32>{%arg0_dim0}, index, tensor<?xf32>{%arg1_dim0}, index) -> tensor<?xf32>{%arg0_dim0}
+ return %ret0 : tensor<?xf32>
+}
+} // module
+
+// -----
+
+// This shows the same element-wise multiply but without the first level of
+// tiling. This will execute in a single workgroup regardless of tensor size
+// (though here it's 4 so it wouldn't be distributed anyway).
+
+// CHECK-LABEL: vm.module public @untiled
+module @untiled {
+// CHECK: vm.rodata private @executable_3_vmvx_bytecode_fb
+stream.executable private @executable_3 {
+ stream.executable.export public @dispatch
+ builtin.module {
+ func @dispatch(%arg0: !stream.binding, %arg1: !stream.binding, %ret0: !stream.binding) {
+ %c0 = arith.constant 0 : index
+ %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
+ %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32>
+ %2 = stream.binding.subspan %ret0[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32>
+ %5 = linalg.init_tensor [4] : tensor<4xf32>
+ %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) {
+ ^bb0(%lhs: f32, %rhs: f32, %out: f32):
+ %7 = arith.mulf %lhs, %rhs : f32
+ linalg.yield %7 : f32
+ } -> tensor<4xf32>
+ flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
+ return
+ }
+ }
+}
+// CHECK: vm.func private @simple_mul_untiled
+func @simple_mul_untiled(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
+ %c1 = arith.constant 1 : index
+ %ret0 = flow.dispatch @executable_3::@dispatch[%c1, %c1, %c1](%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
+ return %ret0 : tensor<4xf32>
+}
+} // module
diff --git a/iree/compiler/Utils/BUILD b/iree/compiler/Utils/BUILD
index ba3a667..78a9829 100644
--- a/iree/compiler/Utils/BUILD
+++ b/iree/compiler/Utils/BUILD
@@ -16,15 +16,18 @@
name = "Utils",
srcs = [
"ConversionUtils.cpp",
+ "CustomKernelsTargetInfo.cpp",
"FlatbufferUtils.cpp",
"GraphUtils.cpp",
"ModuleUtils.cpp",
"OptionUtils.cpp",
"PassUtils.cpp",
+ "StringUtils.cpp",
"TracingUtils.cpp",
],
hdrs = [
"ConversionUtils.h",
+ "CustomKernelsTargetInfo.h",
"FlatbufferUtils.h",
"GraphUtils.h",
"IndexSet.h",
@@ -32,6 +35,7 @@
"OptionUtils.h",
"PassUtils.h",
"PatternUtils.h",
+ "StringUtils.h",
"TracingUtils.h",
],
deps = [
diff --git a/iree/compiler/Utils/CMakeLists.txt b/iree/compiler/Utils/CMakeLists.txt
index 87fe42f..eb73468 100644
--- a/iree/compiler/Utils/CMakeLists.txt
+++ b/iree/compiler/Utils/CMakeLists.txt
@@ -15,6 +15,7 @@
Utils
HDRS
"ConversionUtils.h"
+ "CustomKernelsTargetInfo.h"
"FlatbufferUtils.h"
"GraphUtils.h"
"IndexSet.h"
@@ -22,14 +23,17 @@
"OptionUtils.h"
"PassUtils.h"
"PatternUtils.h"
+ "StringUtils.h"
"TracingUtils.h"
SRCS
"ConversionUtils.cpp"
+ "CustomKernelsTargetInfo.cpp"
"FlatbufferUtils.cpp"
"GraphUtils.cpp"
"ModuleUtils.cpp"
"OptionUtils.cpp"
"PassUtils.cpp"
+ "StringUtils.cpp"
"TracingUtils.cpp"
DEPS
LLVMSupport
diff --git a/iree/compiler/Utils/CustomKernelsTargetInfo.cpp b/iree/compiler/Utils/CustomKernelsTargetInfo.cpp
new file mode 100644
index 0000000..184fdb8
--- /dev/null
+++ b/iree/compiler/Utils/CustomKernelsTargetInfo.cpp
@@ -0,0 +1,51 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Utils/CustomKernelsTargetInfo.h"
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+
+namespace mlir {
+namespace iree_compiler {
+
+LogicalResult ParseCustomKernelTargetFeaturesForAarch64(
+ const llvm::SmallVector<llvm::StringRef> &features,
+ CustomKernelsTargetInfo &target_info) {
+ for (auto f : features) {
+ if (f == "+dotprod") {
+ target_info.add(CustomKernelTargetFeature::Aarch64Dotprod);
+ } else {
+ return failure();
+ }
+ }
+ return success();
+}
+
+LogicalResult ParseCustomKernelsTargetInfo(
+ llvm::StringRef archStr, llvm::StringRef featuresStr,
+ CustomKernelsTargetInfo &target_info) {
+ // Set the out-value to defaults early so that early returns produce
+ // consistent results and so that we can write simpler code below.
+ target_info = CustomKernelsTargetInfo();
+
+ if (archStr.empty()) {
+ return success();
+ }
+
+  llvm::SmallVector<llvm::StringRef> features;
+  // KeepEmpty=false: an empty features string yields no tokens rather than one
+  // empty token that would fail the per-feature parse.
+  featuresStr.split(features, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false);
+
+ if (archStr == "aarch64") {
+ target_info.init(CustomKernelTargetArch::Aarch64);
+ return ParseCustomKernelTargetFeaturesForAarch64(features, target_info);
+ }
+
+ return failure();
+}
+
+} // namespace iree_compiler
+} // namespace mlir
diff --git a/iree/compiler/Utils/CustomKernelsTargetInfo.h b/iree/compiler/Utils/CustomKernelsTargetInfo.h
new file mode 100644
index 0000000..375f338
--- /dev/null
+++ b/iree/compiler/Utils/CustomKernelsTargetInfo.h
@@ -0,0 +1,86 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_COMPILER_UTILS_CUSTOMKERNELSTARGETINFO_H_
+#define IREE_COMPILER_UTILS_CUSTOMKERNELSTARGETINFO_H_
+
+#include <stdint.h>
+
+#include <cassert>
+
+#include "llvm/ADT/StringRef.h"
+#include "mlir/Support/LogicalResult.h"
+
+namespace mlir {
+namespace iree_compiler {
+
+// Enumerates target ISAs that we care about. 'int8_t' because we care about
+// its size: this is used in struct MMTKernel, which is passed by value.
+enum class CustomKernelTargetArch : int8_t { None, Aarch64 };
+
+// Enumerates arch-specific target features that we care about.
+// We explicitly want to stick to the default enumeration values (0, 1, 2, ...,
+// no greater than 63) because this is going to be indexing a uint64 bitfield.
+// Intentionally not reusing bits across architectures to be able to catch
+// most bugs. 64 is enough across all target architectures for now.
+enum class CustomKernelTargetFeature {
+ // Indicates a preference for intrinsics over inline asm. Unlike other bits,
+ // this is generic, not tied to a particular architecture or CPU feature, and
+ // it has to be passed through some custom boolean flag or option, not as
+ // part of the target CPU features.
+ Intrinsics,
+ // Aarch64 features.
+ Aarch64Dotprod,
+};
+
+inline bool isFeatureForArch(CustomKernelTargetFeature feature,
+ CustomKernelTargetArch arch) {
+ switch (feature) {
+ case CustomKernelTargetFeature::Intrinsics:
+ return true;
+ case CustomKernelTargetFeature::Aarch64Dotprod:
+ return arch == CustomKernelTargetArch::Aarch64;
+ }
+ assert(false && "Unhandled CustomKernelTargetFeature value");
+ return false;
+}
+
+// Class used to pass some target information to patterns/passes that need it.
+// The information could come from pass options, e.g.
+// -iree-llvmcpu-vector-contract-custom-kernels='arch=aarch64
+// features=+dotprod intrinsics'
+// or from a parent HAL::ExecutableVariantOp and/or be complemented by a
+// global flag like clMmt4dUseIntrinsics.
+class CustomKernelsTargetInfo {
+ public:
+ void init(CustomKernelTargetArch a) {
+ assert(arch == CustomKernelTargetArch::None);
+ arch = a;
+ }
+ bool has(CustomKernelTargetFeature f) const {
+ if (!isFeatureForArch(f, arch)) {
+ return false;
+ }
+ return features & (1ull << static_cast<int>(f));
+ }
+ void add(CustomKernelTargetFeature f) {
+ assert(isFeatureForArch(f, arch));
+ features |= (1ull << static_cast<int>(f));
+ }
+
+ private:
+ CustomKernelTargetArch arch = CustomKernelTargetArch::None;
+ // Bitfield, with bits indexed by CustomKernelTargetFeature.
+ uint64_t features = 0;
+};
+
+LogicalResult ParseCustomKernelsTargetInfo(
+ llvm::StringRef archStr, llvm::StringRef featuresStr,
+ CustomKernelsTargetInfo &target_info);
+
+} // namespace iree_compiler
+} // namespace mlir
+
+#endif  // IREE_COMPILER_UTILS_CUSTOMKERNELSTARGETINFO_H_
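A minimal usage sketch for the API above (illustrative only; the arch/feature strings mirror the pass-option example in the class comment):

  CustomKernelsTargetInfo info;
  if (succeeded(ParseCustomKernelsTargetInfo("aarch64", "+dotprod", info))) {
    // The Aarch64Dotprod bit is now set; querying a feature belonging to a
    // different architecture would return false via isFeatureForArch.
    bool hasDotprod = info.has(CustomKernelTargetFeature::Aarch64Dotprod);
    (void)hasDotprod;
  }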
diff --git a/iree/compiler/Utils/StringUtils.cpp b/iree/compiler/Utils/StringUtils.cpp
new file mode 100644
index 0000000..94a727a
--- /dev/null
+++ b/iree/compiler/Utils/StringUtils.cpp
@@ -0,0 +1,29 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Utils/StringUtils.h"
+
+namespace mlir {
+namespace iree_compiler {
+
+void replaceAllSubstrsInPlace(std::string &str, const std::string &match,
+ const std::string &substitute) {
+ std::string::size_type scanLoc = 0, matchLoc = std::string::npos;
+ while ((matchLoc = str.find(match, scanLoc)) != std::string::npos) {
+ str.replace(matchLoc, match.size(), substitute);
+ scanLoc = matchLoc + substitute.size();
+ }
+}
+
+std::string replaceAllSubstrs(const std::string &str, const std::string &match,
+ const std::string &substitute) {
+ std::string copy(str);
+ replaceAllSubstrsInPlace(copy, match, substitute);
+ return copy;
+}
+
+} // namespace iree_compiler
+} // namespace mlir
diff --git a/iree/compiler/Utils/StringUtils.h b/iree/compiler/Utils/StringUtils.h
new file mode 100644
index 0000000..dc378ce
--- /dev/null
+++ b/iree/compiler/Utils/StringUtils.h
@@ -0,0 +1,28 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_COMPILER_UTILS_STRINGUTILS_H_
+#define IREE_COMPILER_UTILS_STRINGUTILS_H_
+
+#include <string>
+
+namespace mlir {
+namespace iree_compiler {
+
+// Replaces all occurrences of `match` in `str` with `substitute`.
+// Operates in place.
+void replaceAllSubstrsInPlace(std::string &str, const std::string &match,
+ const std::string &substitute);
+
+// Replaces all occurrences of `match` in `str` with `substitute`.
+// Does not mutate its arguments, returns the new string.
+std::string replaceAllSubstrs(const std::string &str, const std::string &match,
+ const std::string &substitute);
+
+} // namespace iree_compiler
+} // namespace mlir
+
+#endif // IREE_COMPILER_UTILS_STRINGUTILS_H_
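For illustration, the StructAttrGen call site above amounts to a substitution like the following (the template value here is hypothetical):

  // Drop the wrapping quotes so the value expression is spliced in verbatim:
  //   $_builder.getStringAttr("$0")  ->  $_builder.getStringAttr($0)
  std::string builderTemplate = "$_builder.getStringAttr(\"$0\")";
  std::string fixed = replaceAllSubstrs(builderTemplate, "\"$0\"", "$0");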
diff --git a/iree/hal/cuda/BUILD b/iree/hal/cuda/BUILD
deleted file mode 100644
index a7161fb..0000000
--- a/iree/hal/cuda/BUILD
+++ /dev/null
@@ -1,106 +0,0 @@
-# Copyright 2021 The IREE Authors
-#
-# Licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-load("//iree:build_defs.oss.bzl", "iree_cmake_extra_content")
-
-package(
- default_visibility = ["//visibility:public"],
- features = ["layering_check"],
- licenses = ["notice"], # Apache 2.0
-)
-
-iree_cmake_extra_content(
- content = """
-if(NOT ${IREE_HAL_DRIVER_CUDA})
- return()
-endif()
-""",
-)
-
-# Temporarily disabled pending build system changes.
-
-# cc_library(
-# name = "cuda",
-# srcs = [
-# "api.h",
-# "context_wrapper.h",
-# "cuda_allocator.c",
-# "cuda_allocator.h",
-# "cuda_buffer.c",
-# "cuda_buffer.h",
-# "cuda_device.c",
-# "cuda_device.h",
-# "cuda_driver.c",
-# "cuda_event.c",
-# "cuda_event.h",
-# "descriptor_set_layout.c",
-# "descriptor_set_layout.h",
-# "event_semaphore.c",
-# "event_semaphore.h",
-# "executable_layout.c",
-# "executable_layout.h",
-# "graph_command_buffer.c",
-# "graph_command_buffer.h",
-# "native_executable.c",
-# "native_executable.h",
-# "nop_executable_cache.c",
-# "nop_executable_cache.h",
-# "status_util.c",
-# "status_util.h",
-# "stream_command_buffer.c",
-# "stream_command_buffer.h",
-# ],
-# hdrs = [
-# "api.h",
-# ],
-# visibility = ["//visibility:public"],
-# deps = [
-# ":dynamic_symbols",
-# "//iree/base",
-# "//iree/base:core_headers",
-# "//iree/base:tracing",
-# "//iree/base/internal",
-# "//iree/base/internal:arena",
-# "//iree/base/internal:synchronization",
-# "//iree/base/internal/flatcc:parsing",
-# "//iree/hal",
-# "//iree/hal/utils:buffer_transfer",
-# "//iree/hal/utils:deferred_command_buffer",
-# "//iree/hal/utils:resource_set",
-# "//iree/schemas:cuda_executable_def_c_fbs",
-# ],
-# )
-
-# cc_library(
-# name = "dynamic_symbols",
-# srcs = [
-# "cuda_headers.h",
-# "dynamic_symbols.c",
-# ],
-# hdrs = [
-# "dynamic_symbols.h",
-# ],
-# textual_hdrs = [
-# "dynamic_symbol_tables.h",
-# ],
-# deps = [
-# "//iree/base:core_headers",
-# "//iree/base:tracing",
-# "//iree/base/internal:dynamic_library",
-# ],
-# )
-
-# cc_test(
-# name = "dynamic_symbols_test",
-# srcs = ["dynamic_symbols_test.cc"],
-# tags = ["driver=cuda"],
-# deps = [
-# ":dynamic_symbols",
-# "//iree/base",
-# "//iree/testing:gtest",
-# "//iree/testing:gtest_main",
-# ],
-# )
diff --git a/iree/hal/cuda/CMakeLists.txt b/iree/hal/cuda/CMakeLists.txt
index 709da11..76f3936 100644
--- a/iree/hal/cuda/CMakeLists.txt
+++ b/iree/hal/cuda/CMakeLists.txt
@@ -1,17 +1,98 @@
-################################################################################
-# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from #
-# iree/hal/cuda/BUILD #
-# #
-# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary #
-# CMake-only content. #
-# #
-# To disable autogeneration for this file entirely, delete this header. #
-################################################################################
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-if(NOT ${IREE_HAL_DRIVER_CUDA})
+if(NOT IREE_HAL_DRIVER_CUDA)
return()
endif()
+if(NOT CUDAToolkit_INCLUDE_DIRS)
+ message(FATAL_ERROR "No CUDA SDK includes found: should have been set globally")
+endif()
+
iree_add_all_subdirs()
-### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
+iree_cc_library(
+ NAME
+ cuda
+ HDRS
+ "api.h"
+ SRCS
+ "api.h"
+ "context_wrapper.h"
+ "cuda_allocator.c"
+ "cuda_allocator.h"
+ "cuda_buffer.c"
+ "cuda_buffer.h"
+ "cuda_device.c"
+ "cuda_device.h"
+ "cuda_driver.c"
+ "cuda_event.c"
+ "cuda_event.h"
+ "descriptor_set_layout.c"
+ "descriptor_set_layout.h"
+ "event_semaphore.c"
+ "event_semaphore.h"
+ "executable_layout.c"
+ "executable_layout.h"
+ "graph_command_buffer.c"
+ "graph_command_buffer.h"
+ "native_executable.c"
+ "native_executable.h"
+ "nop_executable_cache.c"
+ "nop_executable_cache.h"
+ "status_util.c"
+ "status_util.h"
+ "stream_command_buffer.c"
+ "stream_command_buffer.h"
+ DEPS
+ ::dynamic_symbols
+ iree::base
+ iree::base::core_headers
+ iree::base::internal
+ iree::base::internal::arena
+ iree::base::internal::flatcc::parsing
+ iree::base::internal::synchronization
+ iree::base::tracing
+ iree::hal
+ iree::hal::utils::buffer_transfer
+ iree::hal::utils::deferred_command_buffer
+ iree::hal::utils::resource_set
+ iree::schemas::cuda_executable_def_c_fbs
+ PUBLIC
+)
+
+iree_cc_library(
+ NAME
+ dynamic_symbols
+ HDRS
+ "dynamic_symbols.h"
+ TEXTUAL_HDRS
+ "dynamic_symbol_tables.h"
+ SRCS
+ "cuda_headers.h"
+ "dynamic_symbols.c"
+ INCLUDES
+ ${CUDAToolkit_INCLUDE_DIRS}
+ DEPS
+ iree::base::core_headers
+ iree::base::internal::dynamic_library
+ iree::base::tracing
+ PUBLIC
+)
+
+iree_cc_test(
+ NAME
+ dynamic_symbols_test
+ SRCS
+ "dynamic_symbols_test.cc"
+ DEPS
+ ::dynamic_symbols
+ iree::base
+ iree::testing::gtest
+ iree::testing::gtest_main
+ LABELS
+ "driver=cuda"
+)
diff --git a/iree/hal/cuda/cts/CMakeLists.txt b/iree/hal/cuda/cts/CMakeLists.txt
index 72c2269..e2d6e72 100644
--- a/iree/hal/cuda/cts/CMakeLists.txt
+++ b/iree/hal/cuda/cts/CMakeLists.txt
@@ -4,27 +4,26 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Temporarily disabled pending build system changes.
-# iree_hal_cts_test_suite(
-# DRIVER_NAME
-# cuda
-# DRIVER_REGISTRATION_HDR
-# "iree/hal/cuda/registration/driver_module.h"
-# DRIVER_REGISTRATION_FN
-# "iree_hal_cuda_driver_module_register"
-# COMPILER_TARGET_BACKEND
-# "cuda"
-# EXECUTABLE_FORMAT
-# "\"PTXE\""
-# DEPS
-# iree::hal::cuda::registration
-# EXCLUDED_TESTS
-# # This test depends on iree_hal_cuda_stream_command_buffer_update_buffer
-# # via iree_hal_buffer_view_allocate_buffer, which is not implemented yet.
-# "command_buffer_dispatch"
-# # Non-push descriptor sets are not implemented in the CUDA backend yet.
-# "descriptor_set"
-# # Semaphores are not implemented in the CUDA backend yet.
-# "semaphore_submission"
-# "semaphore"
-# )
+iree_hal_cts_test_suite(
+ DRIVER_NAME
+ cuda
+ DRIVER_REGISTRATION_HDR
+ "iree/hal/cuda/registration/driver_module.h"
+ DRIVER_REGISTRATION_FN
+ "iree_hal_cuda_driver_module_register"
+ COMPILER_TARGET_BACKEND
+ "cuda"
+ EXECUTABLE_FORMAT
+ "\"PTXE\""
+ DEPS
+ iree::hal::cuda::registration
+ EXCLUDED_TESTS
+ # This test depends on iree_hal_cuda_stream_command_buffer_update_buffer
+ # via iree_hal_buffer_view_allocate_buffer, which is not implemented yet.
+ "command_buffer_dispatch"
+ # Non-push descriptor sets are not implemented in the CUDA backend yet.
+ "descriptor_set"
+ # Semaphores are not implemented in the CUDA backend yet.
+ "semaphore_submission"
+ "semaphore"
+)
diff --git a/iree/hal/cuda/registration/BUILD b/iree/hal/cuda/registration/BUILD
deleted file mode 100644
index cf162ed..0000000
--- a/iree/hal/cuda/registration/BUILD
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright 2021 The IREE Authors
-#
-# Licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-load("//iree:build_defs.oss.bzl", "iree_cmake_extra_content")
-
-package(
- default_visibility = ["//visibility:public"],
- features = ["layering_check"],
- licenses = ["notice"], # Apache 2.0
-)
-
-iree_cmake_extra_content(
- content = """
-if(${IREE_HAL_DRIVER_CUDA})
-""",
- inline = True,
-)
-
-# Temporarily disabled pending build system changes.
-
-# cc_library(
-# name = "registration",
-# srcs = ["driver_module.c"],
-# hdrs = ["driver_module.h"],
-# defines = [
-# "IREE_HAL_HAVE_CUDA_DRIVER_MODULE=1",
-# ],
-# deps = [
-# "//iree/base",
-# "//iree/base:cc",
-# "//iree/base:core_headers",
-# "//iree/base:tracing",
-# "//iree/base/internal:flags",
-# "//iree/hal",
-# "//iree/hal/cuda",
-# ],
-# )
-
-iree_cmake_extra_content(
- content = """
-endif()
-""",
- inline = True,
-)
diff --git a/iree/hal/cuda/registration/CMakeLists.txt b/iree/hal/cuda/registration/CMakeLists.txt
index 95b06cb..a26a0e9 100644
--- a/iree/hal/cuda/registration/CMakeLists.txt
+++ b/iree/hal/cuda/registration/CMakeLists.txt
@@ -1,17 +1,31 @@
-################################################################################
-# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from #
-# iree/hal/cuda/registration/BUILD #
-# #
-# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary #
-# CMake-only content. #
-# #
-# To disable autogeneration for this file entirely, delete this header. #
-################################################################################
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
iree_add_all_subdirs()
-if(${IREE_HAL_DRIVER_CUDA})
-
+if(NOT IREE_HAL_DRIVER_CUDA)
+ return()
endif()
-### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
+iree_cc_library(
+ NAME
+ registration
+ HDRS
+ "driver_module.h"
+ SRCS
+ "driver_module.c"
+ DEPS
+ iree::base
+ iree::base::cc
+ iree::base::core_headers
+ iree::base::internal::flags
+ iree::base::tracing
+ iree::hal
+ iree::hal::cuda
+ DEFINES
+ "IREE_HAL_HAVE_CUDA_DRIVER_MODULE=1"
+ PUBLIC
+)
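
With the registration target restored, a host program links it and registers the driver once at startup. A minimal sketch follows; the default-registry accessor iree_hal_driver_registry_default() is an assumption of this sketch (the diff only names the registration function and its header).

  #include "iree/hal/api.h"
  #include "iree/hal/cuda/registration/driver_module.h"

  iree_status_t register_cuda_driver(void) {
    // After registration, the driver can be created by name ("cuda")
    // through the registry. Registry accessor assumed, as noted above.
    return iree_hal_cuda_driver_module_register(
        iree_hal_driver_registry_default());
  }
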
diff --git a/iree/hal/drivers/CMakeLists.txt b/iree/hal/drivers/CMakeLists.txt
index 5836c3e..5dadc57 100644
--- a/iree/hal/drivers/CMakeLists.txt
+++ b/iree/hal/drivers/CMakeLists.txt
@@ -7,26 +7,25 @@
# Doesn't use bazel_to_cmake because of custom configuration vars
set(IREE_HAL_DRIVER_MODULES)
-# Temporarily disabled pending build system changes.
-# if(${IREE_HAL_DRIVER_CUDA})
-# list(APPEND IREE_HAL_DRIVER_MODULES iree::hal::cuda::registration)
-# endif()
-if(${IREE_HAL_DRIVER_DYLIB})
+if(IREE_HAL_DRIVER_CUDA)
+ list(APPEND IREE_HAL_DRIVER_MODULES iree::hal::cuda::registration)
+endif()
+if(IREE_HAL_DRIVER_DYLIB)
list(APPEND IREE_HAL_DRIVER_MODULES iree::hal::dylib::registration)
endif()
-if(${IREE_HAL_DRIVER_DYLIB_SYNC})
+if(IREE_HAL_DRIVER_DYLIB_SYNC)
list(APPEND IREE_HAL_DRIVER_MODULES iree::hal::dylib::registration::sync)
endif()
-if(${IREE_HAL_DRIVER_VMVX})
+if(IREE_HAL_DRIVER_VMVX)
list(APPEND IREE_HAL_DRIVER_MODULES iree::hal::vmvx::registration)
endif()
-if(${IREE_HAL_DRIVER_VMVX_SYNC})
+if(IREE_HAL_DRIVER_VMVX_SYNC)
list(APPEND IREE_HAL_DRIVER_MODULES iree::hal::vmvx::registration::sync)
endif()
-if(${IREE_HAL_DRIVER_VULKAN})
+if(IREE_HAL_DRIVER_VULKAN)
list(APPEND IREE_HAL_DRIVER_MODULES iree::hal::vulkan::registration)
endif()
-if(${IREE_HAL_DRIVER_EXPERIMENTAL_ROCM})
+if(IREE_HAL_DRIVER_EXPERIMENTAL_ROCM)
list(APPEND IREE_HAL_DRIVER_MODULES experimental::rocm::registration)
endif()
diff --git a/iree/samples/custom_modules/dialect/custom_dialect.cc b/iree/samples/custom_modules/dialect/custom_dialect.cc
index 19620d4..2f90e98 100644
--- a/iree/samples/custom_modules/dialect/custom_dialect.cc
+++ b/iree/samples/custom_modules/dialect/custom_dialect.cc
@@ -44,7 +44,7 @@
public:
using VMConversionDialectInterface::VMConversionDialectInterface;
- OwningModuleRef parseVMImportModule() const override {
+ OwningOpRef<mlir::ModuleOp> parseVMImportModule() const override {
return mlir::parseSourceString(
StringRef(iree_custom_imports_create()->data,
iree_custom_imports_create()->size),
diff --git a/iree/samples/simple_embedding/BUILD b/iree/samples/simple_embedding/BUILD
index 4914245..21911ee 100644
--- a/iree/samples/simple_embedding/BUILD
+++ b/iree/samples/simple_embedding/BUILD
@@ -273,7 +273,8 @@
inline = True,
)
-# Temporarily disabled pending build system changes.
+# Disabled because CUDA is not universally available and Bazel does not
+# support configurability in a reasonable or useful way.
# iree_cmake_extra_content(
# content = """
# if(${IREE_HAL_DRIVER_CUDA} AND (${IREE_TARGET_BACKEND_CUDA} OR DEFINED IREE_HOST_BINARY_ROOT))
@@ -310,14 +311,13 @@
# ],
# )
-# Simple embedding is failing in the CI.
-# native_test(
-# name = "simple_embedding_cuda_test",
-# tags = [
-# "driver=cuda",
-# ],
-# src = ":simple_embedding_cuda",
-# )
+# # native_test(
+# # name = "simple_embedding_cuda_test",
+# # src = ":simple_embedding_cuda",
+# # tags = [
+# # "driver=cuda",
+# # ],
+# # )
# iree_cmake_extra_content(
# content = """
diff --git a/iree/test/e2e/cuda_specific/BUILD b/iree/test/e2e/cuda_specific/BUILD
deleted file mode 100644
index a067545..0000000
--- a/iree/test/e2e/cuda_specific/BUILD
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright 2021 The IREE Authors
-#
-# Licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-# Tests for end-to-end IREE support specific to the CUDA backend to be able to
-# incrementally enable features.
-
-# load("//build_tools/bazel:iree_check_test.bzl", "iree_check_single_backend_test_suite")
-
-package(
- default_visibility = ["//visibility:public"],
- features = ["layering_check"],
- licenses = ["notice"], # Apache 2.0
-)
-
-# Temporarily disabled pending build system changes.
-# iree_check_single_backend_test_suite(
-# name = "check_cuda",
-# srcs = [
-# "dot.mlir",
-# ],
-# compiler_flags = ["-iree-input-type=mhlo"],
-# driver = "cuda",
-# tags = [
-# # CUDA cuInit fails with sanitizer on.
-# "noasan",
-# "nomsan",
-# "notsan",
-# "noubsan",
-# "requires-gpu-nvidia",
-# ],
-# target_backend = "cuda",
-# )
diff --git a/iree/test/e2e/cuda_specific/CMakeLists.txt b/iree/test/e2e/cuda_specific/CMakeLists.txt
index ab173dc..4730d22 100644
--- a/iree/test/e2e/cuda_specific/CMakeLists.txt
+++ b/iree/test/e2e/cuda_specific/CMakeLists.txt
@@ -1,13 +1,29 @@
-################################################################################
-# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from #
-# iree/test/e2e/cuda_specific/BUILD #
-# #
-# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary #
-# CMake-only content. #
-# #
-# To disable autogeneration for this file entirely, delete this header. #
-################################################################################
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Tests for end-to-end IREE support specific to the CUDA backend to be able to
+# incrementally enable features.
iree_add_all_subdirs()
-### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
+iree_check_single_backend_test_suite(
+ NAME
+ check_cuda
+ SRCS
+ "dot.mlir"
+ TARGET_BACKEND
+ "cuda"
+ DRIVER
+ "cuda"
+ COMPILER_FLAGS
+ "-iree-input-type=mhlo"
+ LABELS
+ "noasan"
+ "nomsan"
+ "notsan"
+ "noubsan"
+ "requires-gpu-nvidia"
+)
diff --git a/iree/test/e2e/linalg_ext_ops/BUILD b/iree/test/e2e/linalg_ext_ops/BUILD
index 343c8b1..824d951 100644
--- a/iree/test/e2e/linalg_ext_ops/BUILD
+++ b/iree/test/e2e/linalg_ext_ops/BUILD
@@ -13,30 +13,29 @@
licenses = ["notice"], # Apache 2.0
)
-# Temporarily disabled pending build system changes.
-# iree_check_single_backend_test_suite(
-# name = "check_cuda",
-# srcs = enforce_glob(
-# # keep sorted
-# [
-# "reverse.mlir",
-# "scan.mlir",
-# ],
-# include = ["*.mlir"],
-# exclude = [
-# ],
-# ),
-# driver = "cuda",
-# tags = [
-# # CUDA cuInit fails with sanitizer on.
-# "noasan",
-# "nomsan",
-# "notsan",
-# "noubsan",
-# "requires-gpu-nvidia",
-# ],
-# target_backend = "cuda",
-# )
+iree_check_single_backend_test_suite(
+ name = "check_cuda",
+ srcs = enforce_glob(
+ # keep sorted
+ [
+ "reverse.mlir",
+ "scan.mlir",
+ ],
+ include = ["*.mlir"],
+ exclude = [
+ ],
+ ),
+ driver = "cuda",
+ tags = [
+ # CUDA cuInit fails with sanitizer on.
+ "noasan",
+ "nomsan",
+ "notsan",
+ "noubsan",
+ "requires-gpu-nvidia",
+ ],
+ target_backend = "cuda",
+)
iree_check_single_backend_test_suite(
name = "check_dylib-llvm-aot_dylib",
diff --git a/iree/test/e2e/linalg_ext_ops/CMakeLists.txt b/iree/test/e2e/linalg_ext_ops/CMakeLists.txt
index 7aed204..3d9d628 100644
--- a/iree/test/e2e/linalg_ext_ops/CMakeLists.txt
+++ b/iree/test/e2e/linalg_ext_ops/CMakeLists.txt
@@ -12,6 +12,24 @@
iree_check_single_backend_test_suite(
NAME
+ check_cuda
+ SRCS
+ "reverse.mlir"
+ "scan.mlir"
+ TARGET_BACKEND
+ "cuda"
+ DRIVER
+ "cuda"
+ LABELS
+ "noasan"
+ "nomsan"
+ "notsan"
+ "noubsan"
+ "requires-gpu-nvidia"
+)
+
+iree_check_single_backend_test_suite(
+ NAME
check_dylib-llvm-aot_dylib
SRCS
"reverse.mlir"
diff --git a/iree/test/e2e/models/BUILD b/iree/test/e2e/models/BUILD
index d0b1107..ae84e50 100644
--- a/iree/test/e2e/models/BUILD
+++ b/iree/test/e2e/models/BUILD
@@ -74,20 +74,19 @@
target_backend = "vulkan-spirv",
)
-# Temporarily disabled pending build system changes.
-# iree_check_single_backend_test_suite(
-# name = "check_cuda_cuda",
-# timeout = "long",
-# srcs = CHECK_FRAMEWORK_TESTS,
-# compiler_flags = ["-iree-input-type=mhlo"],
-# driver = "cuda",
-# tags = [
-# # CUDA cuInit fails with sanitizer on.
-# "noasan",
-# "nomsan",
-# "notsan",
-# "noubsan",
-# "requires-gpu-nvidia",
-# ],
-# target_backend = "cuda",
-# )
+iree_check_single_backend_test_suite(
+ name = "check_cuda_cuda",
+ timeout = "long",
+ srcs = CHECK_FRAMEWORK_TESTS,
+ compiler_flags = ["-iree-input-type=mhlo"],
+ driver = "cuda",
+ tags = [
+ # CUDA cuInit fails with sanitizer on.
+ "noasan",
+ "nomsan",
+ "notsan",
+ "noubsan",
+ "requires-gpu-nvidia",
+ ],
+ target_backend = "cuda",
+)
diff --git a/iree/test/e2e/models/CMakeLists.txt b/iree/test/e2e/models/CMakeLists.txt
index a668981..a15eb09 100644
--- a/iree/test/e2e/models/CMakeLists.txt
+++ b/iree/test/e2e/models/CMakeLists.txt
@@ -58,4 +58,24 @@
"-iree-input-type=mhlo"
)
+iree_check_single_backend_test_suite(
+ NAME
+ check_cuda_cuda
+ SRCS
+ "bert_encoder_unrolled_fake_weights.mlir"
+ "mobilenetv3_fake_weights.mlir"
+ TARGET_BACKEND
+ "cuda"
+ DRIVER
+ "cuda"
+ COMPILER_FLAGS
+ "-iree-input-type=mhlo"
+ LABELS
+ "noasan"
+ "nomsan"
+ "notsan"
+ "noubsan"
+ "requires-gpu-nvidia"
+)
+
### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/iree/test/e2e/regression/BUILD b/iree/test/e2e/regression/BUILD
index c57be78..bc0e05b 100644
--- a/iree/test/e2e/regression/BUILD
+++ b/iree/test/e2e/regression/BUILD
@@ -91,22 +91,21 @@
target_backend = "vulkan-spirv",
)
-# Temporarily disabled pending build system changes.
-# iree_check_single_backend_test_suite(
-# name = "check_regression_cuda",
-# srcs = BACKEND_TESTS,
-# compiler_flags = ["-iree-input-type=mhlo"],
-# driver = "cuda",
-# tags = [
-# # CUDA cuInit fails with sanitizer on.
-# "noasan",
-# "nomsan",
-# "notsan",
-# "noubsan",
-# "requires-gpu-nvidia",
-# ],
-# target_backend = "cuda",
-# )
+iree_check_single_backend_test_suite(
+ name = "check_regression_cuda",
+ srcs = BACKEND_TESTS,
+ compiler_flags = ["-iree-input-type=mhlo"],
+ driver = "cuda",
+ tags = [
+ # CUDA cuInit fails with sanitizer on.
+ "noasan",
+ "nomsan",
+ "notsan",
+ "noubsan",
+ "requires-gpu-nvidia",
+ ],
+ target_backend = "cuda",
+)
py_binary(
name = "generate_e2e_matmul_tests",
@@ -139,16 +138,13 @@
"--shapes=small",
],
opt_flags = [
- "--iree-flow-convert-linalg-matmul-to-mmt4d=M0=8 K0=%d N0=8" % (4 if lhs_rhs_type == "i8" else 1),
+ "--iree-flow-convert-linalg-matmul-to-mmt4d=enable_generic_slow #pass_options_variant#",
],
target_backends_and_drivers = [
("dylib-llvm-aot", "dylib"),
- ("vmvx", "vmvx"),
],
- target_cpu_features_variants = [
- "default",
- "aarch64:+dotprod",
- ],
+ target_cpu_features_variants = ["default"] +
+ (["aarch64:+dotprod"] if lhs_rhs_type == "i8" else []),
trace_runner = "//iree/tools:iree-e2e-matmul-test",
) for lhs_rhs_type in [
"i8",
@@ -163,16 +159,13 @@
"--shapes=large",
],
opt_flags = [
- "--iree-flow-convert-linalg-matmul-to-mmt4d=M0=8 K0=%d N0=8" % (4 if lhs_rhs_type == "i8" else 1),
+ "--iree-flow-convert-linalg-matmul-to-mmt4d=enable_generic_slow #pass_options_variant#",
],
target_backends_and_drivers = [
("dylib-llvm-aot", "dylib"),
- # TODO: enable VMVX. Skipped for now: it's very slow for these large matmul tests.
],
- target_cpu_features_variants = [
- "default",
- "aarch64:+dotprod",
- ],
+ target_cpu_features_variants = ["default"] +
+ (["aarch64:+dotprod"] if lhs_rhs_type == "i8" else []),
trace_runner = "//iree/tools:iree-e2e-matmul-test",
) for lhs_rhs_type in [
"i8",
@@ -180,7 +173,8 @@
]]
# Test intrinsics. No need to run vmvx again, since it isn't affected by this
-# codegen flag.
+# codegen flag. No need to run "large" sizes, since these tests differ from
+# the others only in ways that are orthogonal to problem size.
[iree_generated_trace_runner_test(
name = "e2e_matmul_mmt4d_%s_intrinsics_%s" % (lhs_rhs_type, size),
compiler_flags = ["--iree-codegen-mmt4d-use-intrinsics"],
@@ -190,20 +184,17 @@
"--shapes=%s" % size,
],
opt_flags = [
- "--iree-flow-convert-linalg-matmul-to-mmt4d=M0=8 K0=%d N0=8" % (4 if lhs_rhs_type == "i8" else 1),
+ "--iree-flow-convert-linalg-matmul-to-mmt4d=enable_generic_slow #pass_options_variant#",
],
target_backends_and_drivers = [
("dylib-llvm-aot", "dylib"),
],
- target_cpu_features_variants = [
- "default",
- "aarch64:+dotprod",
- ],
+ target_cpu_features_variants = ["default"] +
+ (["aarch64:+dotprod"] if lhs_rhs_type == "i8" else []),
trace_runner = "//iree/tools:iree-e2e-matmul-test",
) for lhs_rhs_type in [
"i8",
"f32",
] for size in [
"small",
- "large",
]]
diff --git a/iree/test/e2e/regression/CMakeLists.txt b/iree/test/e2e/regression/CMakeLists.txt
index d3aedfd..31cc3df 100644
--- a/iree/test/e2e/regression/CMakeLists.txt
+++ b/iree/test/e2e/regression/CMakeLists.txt
@@ -96,6 +96,34 @@
"-iree-input-type=mhlo"
)
+iree_check_single_backend_test_suite(
+ NAME
+ check_regression_cuda
+ SRCS
+ "dynamic_abs.mlir"
+ "dynamic_add.mlir"
+ "dynamic_dot.mlir"
+ "dynamic_reduce_min.mlir"
+ "dynamic_torch_index_select_high_rank.mlir"
+ "dynamic_torch_index_select_negative.mlir"
+ "dynamic_torch_index_select_scalar.mlir"
+ "dynamic_torch_index_select_vector.mlir"
+ "linalg_ext_ops.mlir"
+ "linalg_ops.mlir"
+ TARGET_BACKEND
+ "cuda"
+ DRIVER
+ "cuda"
+ COMPILER_FLAGS
+ "-iree-input-type=mhlo"
+ LABELS
+ "noasan"
+ "nomsan"
+ "notsan"
+ "noubsan"
+ "requires-gpu-nvidia"
+)
+
iree_generated_trace_runner_test(
NAME
e2e_matmul_direct_i8_small
@@ -144,12 +172,10 @@
iree_tools_iree-e2e-matmul-test
TARGET_BACKENDS
"dylib-llvm-aot"
- "vmvx"
DRIVERS
"dylib"
- "vmvx"
OPT_FLAGS
- "--iree-flow-convert-linalg-matmul-to-mmt4d=M0=8 K0=4 N0=8"
+ "--iree-flow-convert-linalg-matmul-to-mmt4d=enable_generic_slow #pass_options_variant#"
TARGET_CPU_FEATURES_VARIANTS
"default"
"aarch64:+dotprod"
@@ -167,15 +193,12 @@
iree_tools_iree-e2e-matmul-test
TARGET_BACKENDS
"dylib-llvm-aot"
- "vmvx"
DRIVERS
"dylib"
- "vmvx"
OPT_FLAGS
- "--iree-flow-convert-linalg-matmul-to-mmt4d=M0=8 K0=1 N0=8"
+ "--iree-flow-convert-linalg-matmul-to-mmt4d=enable_generic_slow #pass_options_variant#"
TARGET_CPU_FEATURES_VARIANTS
"default"
- "aarch64:+dotprod"
)
iree_generated_trace_runner_test(
@@ -193,7 +216,7 @@
DRIVERS
"dylib"
OPT_FLAGS
- "--iree-flow-convert-linalg-matmul-to-mmt4d=M0=8 K0=4 N0=8"
+ "--iree-flow-convert-linalg-matmul-to-mmt4d=enable_generic_slow #pass_options_variant#"
TARGET_CPU_FEATURES_VARIANTS
"default"
"aarch64:+dotprod"
@@ -214,10 +237,9 @@
DRIVERS
"dylib"
OPT_FLAGS
- "--iree-flow-convert-linalg-matmul-to-mmt4d=M0=8 K0=1 N0=8"
+ "--iree-flow-convert-linalg-matmul-to-mmt4d=enable_generic_slow #pass_options_variant#"
TARGET_CPU_FEATURES_VARIANTS
"default"
- "aarch64:+dotprod"
)
iree_generated_trace_runner_test(
@@ -237,30 +259,7 @@
COMPILER_FLAGS
"--iree-codegen-mmt4d-use-intrinsics"
OPT_FLAGS
- "--iree-flow-convert-linalg-matmul-to-mmt4d=M0=8 K0=4 N0=8"
- TARGET_CPU_FEATURES_VARIANTS
- "default"
- "aarch64:+dotprod"
-)
-
-iree_generated_trace_runner_test(
- NAME
- e2e_matmul_mmt4d_i8_intrinsics_large
- GENERATOR
- "generate_e2e_matmul_tests.py"
- GENERATOR_ARGS
- "--lhs_rhs_type=i8"
- "--shapes=large"
- TRACE_RUNNER
- iree_tools_iree-e2e-matmul-test
- TARGET_BACKENDS
- "dylib-llvm-aot"
- DRIVERS
- "dylib"
- COMPILER_FLAGS
- "--iree-codegen-mmt4d-use-intrinsics"
- OPT_FLAGS
- "--iree-flow-convert-linalg-matmul-to-mmt4d=M0=8 K0=4 N0=8"
+ "--iree-flow-convert-linalg-matmul-to-mmt4d=enable_generic_slow #pass_options_variant#"
TARGET_CPU_FEATURES_VARIANTS
"default"
"aarch64:+dotprod"
@@ -283,33 +282,9 @@
COMPILER_FLAGS
"--iree-codegen-mmt4d-use-intrinsics"
OPT_FLAGS
- "--iree-flow-convert-linalg-matmul-to-mmt4d=M0=8 K0=1 N0=8"
+ "--iree-flow-convert-linalg-matmul-to-mmt4d=enable_generic_slow #pass_options_variant#"
TARGET_CPU_FEATURES_VARIANTS
"default"
- "aarch64:+dotprod"
-)
-
-iree_generated_trace_runner_test(
- NAME
- e2e_matmul_mmt4d_f32_intrinsics_large
- GENERATOR
- "generate_e2e_matmul_tests.py"
- GENERATOR_ARGS
- "--lhs_rhs_type=f32"
- "--shapes=large"
- TRACE_RUNNER
- iree_tools_iree-e2e-matmul-test
- TARGET_BACKENDS
- "dylib-llvm-aot"
- DRIVERS
- "dylib"
- COMPILER_FLAGS
- "--iree-codegen-mmt4d-use-intrinsics"
- OPT_FLAGS
- "--iree-flow-convert-linalg-matmul-to-mmt4d=M0=8 K0=1 N0=8"
- TARGET_CPU_FEATURES_VARIANTS
- "default"
- "aarch64:+dotprod"
)
### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/iree/test/e2e/tensor_ops/BUILD b/iree/test/e2e/tensor_ops/BUILD
index ba43404..1c694b2 100644
--- a/iree/test/e2e/tensor_ops/BUILD
+++ b/iree/test/e2e/tensor_ops/BUILD
@@ -51,30 +51,29 @@
target_backend = "dylib-llvm-aot",
)
-# Temporarily disabled pending build system changes.
-# iree_check_single_backend_test_suite(
-# name = "check_cuda",
-# srcs = enforce_glob(
-# # keep sorted
-# [
-# "extract_slice.mlir",
-# "tensor_insert_slice.mlir",
-# ],
-# include = ["*.mlir"],
-# exclude = [
-# "tensor_cast.mlir",
-# ],
-# ),
-# driver = "cuda",
-# tags = [
-# "noasan",
-# "nomsan",
-# "notsan",
-# "noubsan",
-# "requires-gpu-nvidia",
-# ],
-# target_backend = "cuda",
-# )
+iree_check_single_backend_test_suite(
+ name = "check_cuda",
+ srcs = enforce_glob(
+ # keep sorted
+ [
+ "extract_slice.mlir",
+ "tensor_insert_slice.mlir",
+ ],
+ include = ["*.mlir"],
+ exclude = [
+ "tensor_cast.mlir",
+ ],
+ ),
+ driver = "cuda",
+ tags = [
+ "noasan",
+ "nomsan",
+ "notsan",
+ "noubsan",
+ "requires-gpu-nvidia",
+ ],
+ target_backend = "cuda",
+)
iree_check_single_backend_test_suite(
name = "check_vulkan-spirv_vulkan",
diff --git a/iree/test/e2e/tensor_ops/CMakeLists.txt b/iree/test/e2e/tensor_ops/CMakeLists.txt
index 279a5c4..2acc931 100644
--- a/iree/test/e2e/tensor_ops/CMakeLists.txt
+++ b/iree/test/e2e/tensor_ops/CMakeLists.txt
@@ -38,6 +38,24 @@
iree_check_single_backend_test_suite(
NAME
+ check_cuda
+ SRCS
+ "extract_slice.mlir"
+ "tensor_insert_slice.mlir"
+ TARGET_BACKEND
+ "cuda"
+ DRIVER
+ "cuda"
+ LABELS
+ "noasan"
+ "nomsan"
+ "notsan"
+ "noubsan"
+ "requires-gpu-nvidia"
+)
+
+iree_check_single_backend_test_suite(
+ NAME
check_vulkan-spirv_vulkan
SRCS
"extract_slice.mlir"
diff --git a/iree/test/e2e/xla_ops/BUILD b/iree/test/e2e/xla_ops/BUILD
index 50f0354..20e3b73 100644
--- a/iree/test/e2e/xla_ops/BUILD
+++ b/iree/test/e2e/xla_ops/BUILD
@@ -19,167 +19,166 @@
licenses = ["notice"], # Apache 2.0
)
-# Temporarily disabled pending build system changes.
-# iree_check_single_backend_test_suite(
-# name = "check_cuda_graph",
-# srcs = enforce_glob(
-# # keep sorted
-# [
-# "abs.mlir",
-# "add.mlir",
-# "batch_norm_inference.mlir",
-# "bitcast_convert.mlir",
-# "broadcast.mlir",
-# "broadcast_add.mlir",
-# "broadcast_in_dim.mlir",
-# "clamp.mlir",
-# "compare.mlir",
-# "concatenate.mlir",
-# "constant.mlir",
-# "convert.mlir",
-# "convolution.mlir",
-# "cosine.mlir",
-# "divide.mlir",
-# "dot.mlir",
-# "dot_general.mlir",
-# "dynamic_slice.mlir",
-# "dynamic_update_slice.mlir",
-# "exponential.mlir",
-# "exponential_fp16.mlir",
-# "exponential_minus_one.mlir",
-# "fft.mlir",
-# "finite.mlir",
-# "floor.mlir",
-# "gather.mlir",
-# "iota.mlir",
-# "log.mlir",
-# "log_plus_one.mlir",
-# "maximum.mlir",
-# "minimum.mlir",
-# "multiply.mlir",
-# "negate.mlir",
-# "pad.mlir",
-# "pow.mlir",
-# "reduce.mlir",
-# "reduce_window.mlir",
-# "remainder.mlir",
-# "reshape.mlir",
-# "reverse.mlir",
-# "rng_normal.mlir",
-# "rng_uniform.mlir",
-# "rsqrt.mlir",
-# "scatter.mlir",
-# "scatter_dynamic.mlir",
-# "select.mlir",
-# "sine.mlir",
-# "slice.mlir",
-# "sort.mlir",
-# "sqrt.mlir",
-# "subtract.mlir",
-# "tanh.mlir",
-# "torch_index_select.mlir",
-# "transpose.mlir",
-# "while.mlir",
-# ],
-# include = ["*.mlir"],
-# exclude = [
-# "round.mlir",
-# ],
-# ),
-# compiler_flags = ["-iree-input-type=mhlo"],
-# driver = "cuda",
-# runner_args = ["--cuda_use_streams=false"],
-# tags = [
-# # CUDA cuInit fails with sanitizer on.
-# "noasan",
-# "nomsan",
-# "notsan",
-# "noubsan",
-# "requires-gpu-nvidia",
-# ],
-# target_backend = "cuda",
-# )
+iree_check_single_backend_test_suite(
+ name = "check_cuda_graph",
+ srcs = enforce_glob(
+ # keep sorted
+ [
+ "abs.mlir",
+ "add.mlir",
+ "batch_norm_inference.mlir",
+ "bitcast_convert.mlir",
+ "broadcast.mlir",
+ "broadcast_add.mlir",
+ "broadcast_in_dim.mlir",
+ "clamp.mlir",
+ "compare.mlir",
+ "concatenate.mlir",
+ "constant.mlir",
+ "convert.mlir",
+ "convolution.mlir",
+ "cosine.mlir",
+ "divide.mlir",
+ "dot.mlir",
+ "dot_general.mlir",
+ "dynamic_slice.mlir",
+ "dynamic_update_slice.mlir",
+ "exponential.mlir",
+ "exponential_fp16.mlir",
+ "exponential_minus_one.mlir",
+ "fft.mlir",
+ "finite.mlir",
+ "floor.mlir",
+ "gather.mlir",
+ "iota.mlir",
+ "log.mlir",
+ "log_plus_one.mlir",
+ "maximum.mlir",
+ "minimum.mlir",
+ "multiply.mlir",
+ "negate.mlir",
+ "pad.mlir",
+ "pow.mlir",
+ "reduce.mlir",
+ "reduce_window.mlir",
+ "remainder.mlir",
+ "reshape.mlir",
+ "reverse.mlir",
+ "rng_normal.mlir",
+ "rng_uniform.mlir",
+ "rsqrt.mlir",
+ "scatter.mlir",
+ "scatter_dynamic.mlir",
+ "select.mlir",
+ "sine.mlir",
+ "slice.mlir",
+ "sort.mlir",
+ "sqrt.mlir",
+ "subtract.mlir",
+ "tanh.mlir",
+ "torch_index_select.mlir",
+ "transpose.mlir",
+ "while.mlir",
+ ],
+ include = ["*.mlir"],
+ exclude = [
+ "round.mlir",
+ ],
+ ),
+ compiler_flags = ["-iree-input-type=mhlo"],
+ driver = "cuda",
+ runner_args = ["--cuda_use_streams=false"],
+ tags = [
+ # CUDA cuInit fails with sanitizer on.
+ "noasan",
+ "nomsan",
+ "notsan",
+ "noubsan",
+ "requires-gpu-nvidia",
+ ],
+ target_backend = "cuda",
+)
# # Run cuda tests using stream command buffer
-# iree_check_single_backend_test_suite(
-# name = "check_cuda_streams",
-# srcs = enforce_glob(
-# # keep sorted
-# [
-# "abs.mlir",
-# "add.mlir",
-# "batch_norm_inference.mlir",
-# "bitcast_convert.mlir",
-# "broadcast.mlir",
-# "broadcast_add.mlir",
-# "broadcast_in_dim.mlir",
-# "clamp.mlir",
-# "compare.mlir",
-# "concatenate.mlir",
-# "constant.mlir",
-# "convert.mlir",
-# "convolution.mlir",
-# "cosine.mlir",
-# "divide.mlir",
-# "dot.mlir",
-# "dot_general.mlir",
-# "dynamic_slice.mlir",
-# "dynamic_update_slice.mlir",
-# "exponential.mlir",
-# "exponential_fp16.mlir",
-# "exponential_minus_one.mlir",
-# "fft.mlir",
-# "finite.mlir",
-# "floor.mlir",
-# "gather.mlir",
-# "iota.mlir",
-# "log.mlir",
-# "log_plus_one.mlir",
-# "maximum.mlir",
-# "minimum.mlir",
-# "multiply.mlir",
-# "negate.mlir",
-# "pad.mlir",
-# "pow.mlir",
-# "reduce.mlir",
-# "reduce_window.mlir",
-# "remainder.mlir",
-# "reshape.mlir",
-# "reverse.mlir",
-# "rng_normal.mlir",
-# "rng_uniform.mlir",
-# "rsqrt.mlir",
-# "scatter.mlir",
-# "scatter_dynamic.mlir",
-# "select.mlir",
-# "sine.mlir",
-# "slice.mlir",
-# "sort.mlir",
-# "sqrt.mlir",
-# "subtract.mlir",
-# "tanh.mlir",
-# "torch_index_select.mlir",
-# "transpose.mlir",
-# "while.mlir",
-# ],
-# include = ["*.mlir"],
-# exclude = [
-# "round.mlir",
-# ],
-# ),
-# compiler_flags = ["-iree-input-type=mhlo"],
-# driver = "cuda",
-# runner_args = ["--cuda_use_streams=true"],
-# tags = [
-# # CUDA cuInit fails with sanitizer on.
-# "noasan",
-# "nomsan",
-# "notsan",
-# "noubsan",
-# "requires-gpu-nvidia",
-# ],
-# target_backend = "cuda",
-# )
+iree_check_single_backend_test_suite(
+ name = "check_cuda_streams",
+ srcs = enforce_glob(
+ # keep sorted
+ [
+ "abs.mlir",
+ "add.mlir",
+ "batch_norm_inference.mlir",
+ "bitcast_convert.mlir",
+ "broadcast.mlir",
+ "broadcast_add.mlir",
+ "broadcast_in_dim.mlir",
+ "clamp.mlir",
+ "compare.mlir",
+ "concatenate.mlir",
+ "constant.mlir",
+ "convert.mlir",
+ "convolution.mlir",
+ "cosine.mlir",
+ "divide.mlir",
+ "dot.mlir",
+ "dot_general.mlir",
+ "dynamic_slice.mlir",
+ "dynamic_update_slice.mlir",
+ "exponential.mlir",
+ "exponential_fp16.mlir",
+ "exponential_minus_one.mlir",
+ "fft.mlir",
+ "finite.mlir",
+ "floor.mlir",
+ "gather.mlir",
+ "iota.mlir",
+ "log.mlir",
+ "log_plus_one.mlir",
+ "maximum.mlir",
+ "minimum.mlir",
+ "multiply.mlir",
+ "negate.mlir",
+ "pad.mlir",
+ "pow.mlir",
+ "reduce.mlir",
+ "reduce_window.mlir",
+ "remainder.mlir",
+ "reshape.mlir",
+ "reverse.mlir",
+ "rng_normal.mlir",
+ "rng_uniform.mlir",
+ "rsqrt.mlir",
+ "scatter.mlir",
+ "scatter_dynamic.mlir",
+ "select.mlir",
+ "sine.mlir",
+ "slice.mlir",
+ "sort.mlir",
+ "sqrt.mlir",
+ "subtract.mlir",
+ "tanh.mlir",
+ "torch_index_select.mlir",
+ "transpose.mlir",
+ "while.mlir",
+ ],
+ include = ["*.mlir"],
+ exclude = [
+ "round.mlir",
+ ],
+ ),
+ compiler_flags = ["-iree-input-type=mhlo"],
+ driver = "cuda",
+ runner_args = ["--cuda_use_streams=true"],
+ tags = [
+ # CUDA cuInit fails with sanitizer on.
+ "noasan",
+ "nomsan",
+ "notsan",
+ "noubsan",
+ "requires-gpu-nvidia",
+ ],
+ target_backend = "cuda",
+)
iree_check_single_backend_test_suite(
name = "check_dylib-llvm-aot_dylib",
diff --git a/iree/test/e2e/xla_ops/CMakeLists.txt b/iree/test/e2e/xla_ops/CMakeLists.txt
index 0b0086e..05236c9 100644
--- a/iree/test/e2e/xla_ops/CMakeLists.txt
+++ b/iree/test/e2e/xla_ops/CMakeLists.txt
@@ -12,6 +12,156 @@
iree_check_single_backend_test_suite(
NAME
+ check_cuda_graph
+ SRCS
+ "abs.mlir"
+ "add.mlir"
+ "batch_norm_inference.mlir"
+ "bitcast_convert.mlir"
+ "broadcast.mlir"
+ "broadcast_add.mlir"
+ "broadcast_in_dim.mlir"
+ "clamp.mlir"
+ "compare.mlir"
+ "concatenate.mlir"
+ "constant.mlir"
+ "convert.mlir"
+ "convolution.mlir"
+ "cosine.mlir"
+ "divide.mlir"
+ "dot.mlir"
+ "dot_general.mlir"
+ "dynamic_slice.mlir"
+ "dynamic_update_slice.mlir"
+ "exponential.mlir"
+ "exponential_fp16.mlir"
+ "exponential_minus_one.mlir"
+ "fft.mlir"
+ "finite.mlir"
+ "floor.mlir"
+ "gather.mlir"
+ "iota.mlir"
+ "log.mlir"
+ "log_plus_one.mlir"
+ "maximum.mlir"
+ "minimum.mlir"
+ "multiply.mlir"
+ "negate.mlir"
+ "pad.mlir"
+ "pow.mlir"
+ "reduce.mlir"
+ "reduce_window.mlir"
+ "remainder.mlir"
+ "reshape.mlir"
+ "reverse.mlir"
+ "rng_normal.mlir"
+ "rng_uniform.mlir"
+ "rsqrt.mlir"
+ "scatter.mlir"
+ "scatter_dynamic.mlir"
+ "select.mlir"
+ "sine.mlir"
+ "slice.mlir"
+ "sort.mlir"
+ "sqrt.mlir"
+ "subtract.mlir"
+ "tanh.mlir"
+ "torch_index_select.mlir"
+ "transpose.mlir"
+ "while.mlir"
+ TARGET_BACKEND
+ "cuda"
+ DRIVER
+ "cuda"
+ COMPILER_FLAGS
+ "-iree-input-type=mhlo"
+ RUNNER_ARGS
+ "--cuda_use_streams=false"
+ LABELS
+ "noasan"
+ "nomsan"
+ "notsan"
+ "noubsan"
+ "requires-gpu-nvidia"
+)
+
+iree_check_single_backend_test_suite(
+ NAME
+ check_cuda_streams
+ SRCS
+ "abs.mlir"
+ "add.mlir"
+ "batch_norm_inference.mlir"
+ "bitcast_convert.mlir"
+ "broadcast.mlir"
+ "broadcast_add.mlir"
+ "broadcast_in_dim.mlir"
+ "clamp.mlir"
+ "compare.mlir"
+ "concatenate.mlir"
+ "constant.mlir"
+ "convert.mlir"
+ "convolution.mlir"
+ "cosine.mlir"
+ "divide.mlir"
+ "dot.mlir"
+ "dot_general.mlir"
+ "dynamic_slice.mlir"
+ "dynamic_update_slice.mlir"
+ "exponential.mlir"
+ "exponential_fp16.mlir"
+ "exponential_minus_one.mlir"
+ "fft.mlir"
+ "finite.mlir"
+ "floor.mlir"
+ "gather.mlir"
+ "iota.mlir"
+ "log.mlir"
+ "log_plus_one.mlir"
+ "maximum.mlir"
+ "minimum.mlir"
+ "multiply.mlir"
+ "negate.mlir"
+ "pad.mlir"
+ "pow.mlir"
+ "reduce.mlir"
+ "reduce_window.mlir"
+ "remainder.mlir"
+ "reshape.mlir"
+ "reverse.mlir"
+ "rng_normal.mlir"
+ "rng_uniform.mlir"
+ "rsqrt.mlir"
+ "scatter.mlir"
+ "scatter_dynamic.mlir"
+ "select.mlir"
+ "sine.mlir"
+ "slice.mlir"
+ "sort.mlir"
+ "sqrt.mlir"
+ "subtract.mlir"
+ "tanh.mlir"
+ "torch_index_select.mlir"
+ "transpose.mlir"
+ "while.mlir"
+ TARGET_BACKEND
+ "cuda"
+ DRIVER
+ "cuda"
+ COMPILER_FLAGS
+ "-iree-input-type=mhlo"
+ RUNNER_ARGS
+ "--cuda_use_streams=true"
+ LABELS
+ "noasan"
+ "nomsan"
+ "notsan"
+ "noubsan"
+ "requires-gpu-nvidia"
+)
+
+iree_check_single_backend_test_suite(
+ NAME
check_dylib-llvm-aot_dylib
SRCS
"abs.mlir"
diff --git a/iree/tools/BUILD b/iree/tools/BUILD
index bba42ff..694999f 100644
--- a/iree/tools/BUILD
+++ b/iree/tools/BUILD
@@ -365,6 +365,7 @@
tags = ["hostonly"],
deps = [
"//build_tools:default_linkopts",
+ "//iree/compiler/Utils",
"@llvm-project//llvm:Support",
"@llvm-project//llvm:TableGen",
"@llvm-project//mlir:MlirTableGenMain",
diff --git a/iree/tools/CMakeLists.txt b/iree/tools/CMakeLists.txt
index 3c6caff..833970d 100644
--- a/iree/tools/CMakeLists.txt
+++ b/iree/tools/CMakeLists.txt
@@ -19,31 +19,31 @@
# Enable compiler targets based on options.
set(IREE_COMPILER_TARGETS "")
set(IREE_COMPILER_TARGET_COPTS "")
-if("${IREE_TARGET_BACKEND_DYLIB_LLVM_AOT}" OR "${IREE_TARGET_BACKEND_WASM_LLVM_AOT}")
+if(IREE_TARGET_BACKEND_DYLIB_LLVM_AOT OR IREE_TARGET_BACKEND_WASM_LLVM_AOT)
list(APPEND IREE_COMPILER_TARGETS iree::compiler::Dialect::HAL::Target::LLVM)
list(APPEND IREE_COMPILER_TARGET_COPTS "-DIREE_HAVE_LLVMAOT_TARGET")
endif()
-if("${IREE_TARGET_BACKEND_METAL_SPIRV}")
+if(IREE_TARGET_BACKEND_METAL_SPIRV)
list(APPEND IREE_COMPILER_TARGETS iree::compiler::Dialect::HAL::Target::MetalSPIRV)
list(APPEND IREE_COMPILER_TARGET_COPTS "-DIREE_HAVE_METALSPIRV_TARGET")
endif()
-if("${IREE_TARGET_BACKEND_VMVX}")
+if(IREE_TARGET_BACKEND_VMVX)
list(APPEND IREE_COMPILER_TARGETS iree::compiler::Dialect::HAL::Target::VMVX)
list(APPEND IREE_COMPILER_TARGET_COPTS "-DIREE_HAVE_VMVX_TARGET")
endif()
-if("${IREE_TARGET_BACKEND_VULKAN_SPIRV}")
+if(IREE_TARGET_BACKEND_VULKAN_SPIRV)
list(APPEND IREE_COMPILER_TARGETS iree::compiler::Dialect::HAL::Target::VulkanSPIRV)
list(APPEND IREE_COMPILER_TARGET_COPTS "-DIREE_HAVE_VULKANSPIRV_TARGET")
endif()
-if("${IREE_TARGET_BACKEND_WEBGPU}")
+if(IREE_TARGET_BACKEND_WEBGPU)
list(APPEND IREE_COMPILER_TARGETS iree::compiler::Dialect::HAL::Target::WebGPU)
list(APPEND IREE_COMPILER_TARGET_COPTS "-DIREE_HAVE_WEBGPU_TARGET")
endif()
-# if("${IREE_TARGET_BACKEND_CUDA}")
-# list(APPEND IREE_COMPILER_TARGETS iree::compiler::Dialect::HAL::Target::CUDA)
-# list(APPEND IREE_COMPILER_TARGET_COPTS "-DIREE_HAVE_CUDA_TARGET")
-# endif()
-if("${IREE_TARGET_BACKEND_ROCM}")
+if(IREE_TARGET_BACKEND_CUDA)
+ list(APPEND IREE_COMPILER_TARGETS iree::compiler::Dialect::HAL::Target::CUDA)
+ list(APPEND IREE_COMPILER_TARGET_COPTS "-DIREE_HAVE_CUDA_TARGET")
+endif()
+if(IREE_TARGET_BACKEND_ROCM)
list(APPEND IREE_COMPILER_TARGETS iree::compiler::Dialect::HAL::Target::ROCM)
list(APPEND IREE_COMPILER_TARGET_COPTS "-DIREE_HAVE_ROCM_TARGET")
endif()
@@ -205,6 +205,7 @@
LLVMTableGen
MLIRSupport
MLIRTableGen
+ iree::compiler::Utils
HOSTONLY
)
diff --git a/iree/tools/init_mlir_dialects.h b/iree/tools/init_mlir_dialects.h
index 4d9ec16..3976875 100644
--- a/iree/tools/init_mlir_dialects.h
+++ b/iree/tools/init_mlir_dialects.h
@@ -28,7 +28,7 @@
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.h"
#include "mlir/Dialect/Tosa/IR/TosaOps.h"
-#include "mlir/Dialect/Vector/VectorOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/Dialect.h"
#ifdef IREE_HAVE_EMITC_DIALECT
diff --git a/iree/tools/iree-run-mlir-main.cc b/iree/tools/iree-run-mlir-main.cc
index 6ce03f3..3cdaf53 100644
--- a/iree/tools/iree-run-mlir-main.cc
+++ b/iree/tools/iree-run-mlir-main.cc
@@ -194,7 +194,7 @@
// Parse input MLIR module.
llvm::SourceMgr source_mgr;
source_mgr.AddNewSourceBuffer(std::move(file_buffer), llvm::SMLoc());
- mlir::OwningModuleRef mlir_module =
+ mlir::OwningOpRef<mlir::ModuleOp> mlir_module =
mlir::parseSourceFile(source_mgr, &context);
if (!mlir_module) {
return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
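
The OwningModuleRef -> OwningOpRef<mlir::ModuleOp> rename above recurs throughout this change; a minimal sketch of the updated parse-and-own pattern at this LLVM revision (function and variable names are illustrative):

  #include "llvm/Support/SourceMgr.h"
  #include "mlir/IR/BuiltinOps.h"
  #include "mlir/IR/MLIRContext.h"
  #include "mlir/Parser.h"

  bool parseAndDump(llvm::SourceMgr &source_mgr, mlir::MLIRContext &context) {
    // OwningOpRef owns the parsed ModuleOp and erases it on destruction;
    // it replaces the removed OwningModuleRef alias.
    mlir::OwningOpRef<mlir::ModuleOp> mlir_module =
        mlir::parseSourceFile(source_mgr, &context);
    if (!mlir_module) return false;  // parse errors already went to diagnostics
    mlir_module->dump();             // borrow the op; the ref retains ownership
    return true;
  }
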
diff --git a/iree/tools/iree_translate_lib.cc b/iree/tools/iree_translate_lib.cc
index 0fe1b8e..b130696 100644
--- a/iree/tools/iree_translate_lib.cc
+++ b/iree/tools/iree_translate_lib.cc
@@ -103,6 +103,15 @@
return 1;
}
+  // The value is required in processBuffer, but if the Required option is
+  // set on the flag above then an error is reported per possible translation
+  // rather than a single one, so check explicitly here instead.
+ if (!translationRequested) {
+    llvm::errs()
+        << "A translation to perform must be specified at least once!\n";
+ return 1;
+ }
+
/// Processes the memory buffer with a new MLIRContext.
auto processBuffer = [&](std::unique_ptr<llvm::MemoryBuffer> ownedBuffer,
llvm::raw_ostream &os) {
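
In isolation, the pattern adopted here is: leave the llvm::cl option non-Required and validate it once after parsing, so only a single diagnostic is emitted. A standalone sketch with illustrative option names:

  #include "llvm/Support/CommandLine.h"
  #include "llvm/Support/raw_ostream.h"

  static llvm::cl::list<std::string> translations(
      "translation", llvm::cl::desc("Translation(s) to perform"));

  int main(int argc, char **argv) {
    llvm::cl::ParseCommandLineOptions(argc, argv);
    // Manual check instead of llvm::cl::Required: one error message total,
    // not one per registered translation.
    if (translations.empty()) {
      llvm::errs() << "a translation must be specified at least once\n";
      return 1;
    }
    return 0;
  }
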
diff --git a/llvm-external-projects/iree-compiler-api/pyproject.toml b/llvm-external-projects/iree-compiler-api/pyproject.toml
index 268cc91..a5ba7a4 100644
--- a/llvm-external-projects/iree-compiler-api/pyproject.toml
+++ b/llvm-external-projects/iree-compiler-api/pyproject.toml
@@ -1,15 +1,15 @@
-[build-system]
-requires = [
- "setuptools>=42",
- "wheel",
- # There is no fundamental reason to pin this CMake version, beyond
- # build stability.
- "cmake==3.18",
- "ninja==1.10.2",
- # MLIR build depends.
- "numpy",
- # Version 2.7.0 excluded: https://github.com/pybind/pybind11/issues/3136
- "pybind11>=2.6.0,!=2.7.0",
- "PyYAML",
-]
-build-backend = "setuptools.build_meta"
+[build-system]
+requires = [
+ "setuptools>=42",
+ "wheel",
+ # There is no fundamental reason to pin this CMake version, beyond
+ # build stability.
+ "cmake==3.22.2",
+ "ninja==1.10.2",
+ # MLIR build depends.
+ "numpy",
+ # Version 2.7.0 excluded: https://github.com/pybind/pybind11/issues/3136
+ "pybind11>=2.6.0,!=2.7.0",
+ "PyYAML",
+]
+build-backend = "setuptools.build_meta"
diff --git a/llvm-external-projects/iree-compiler-api/setup.py b/llvm-external-projects/iree-compiler-api/setup.py
index fbb0a2c..c1ab146 100644
--- a/llvm-external-projects/iree-compiler-api/setup.py
+++ b/llvm-external-projects/iree-compiler-api/setup.py
@@ -91,6 +91,7 @@
cfg = "Release"
cmake_args = [
"-GNinja",
+ "--log-level=VERBOSE",
"-DCMAKE_INSTALL_PREFIX={}".format(cmake_install_dir),
"-DPython3_EXECUTABLE={}".format(sys.executable),
"-DPython3_INCLUDE_DIRS={}".format(sysconfig.get_path("include")),
@@ -98,6 +99,11 @@
"-DCMAKE_BUILD_TYPE={}".format(cfg),
]
+ # Enable CUDA if specified.
+ cuda_target_option = os.getenv("IREE_TARGET_BACKEND_CUDA")
+ if cuda_target_option:
+ cmake_args.append(f"-DIREE_TARGET_BACKEND_CUDA={cuda_target_option}")
+
build_args = []
if os.path.exists(cmake_install_dir):
shutil.rmtree(cmake_install_dir)
diff --git a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/Input/InputBase.td b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/Input/InputBase.td
index 6ed641b..3990bd8 100644
--- a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/Input/InputBase.td
+++ b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/Input/InputBase.td
@@ -28,9 +28,9 @@
let cppNamespace = "::mlir::iree_compiler::IREE::Input";
}
-class IREEInput_Op<string mnemonic, list<OpTrait> traits = []> :
+class IREEInput_Op<string mnemonic, list<Trait> traits = []> :
Op<IREEInput_Dialect, mnemonic, traits>;
-class IREEInput_PureOp<string mnemonic, list<OpTrait> traits = []> :
+class IREEInput_PureOp<string mnemonic, list<Trait> traits = []> :
Op<IREEInput_Dialect, mnemonic, !listconcat(traits, [NoSideEffect])>;
class IREEInput_Type<string name> : TypeDef<IREEInput_Dialect, name>;
diff --git a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.td b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.td
index f149e68..4eb9d08 100644
--- a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.td
+++ b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.td
@@ -17,11 +17,11 @@
// Base class.
//===----------------------------------------------------------------------===//
-class IREELinalgExt_PureOp<string mnemonic, list<OpTrait> traits = []> :
+class IREELinalgExt_PureOp<string mnemonic, list<Trait> traits = []> :
Op<IREELinalgExt_Dialect, mnemonic, traits> {
}
-class IREELinalgExt_Op<string mnemonic, list<OpTrait> traits = []> :
+class IREELinalgExt_Op<string mnemonic, list<Trait> traits = []> :
IREELinalgExt_PureOp<mnemonic, !listconcat(traits,
[AttrSizedOperandSegments,
DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
diff --git a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMBase.td b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMBase.td
index 96ebfbf..4f20e1d 100644
--- a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMBase.td
+++ b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMBase.td
@@ -33,12 +33,12 @@
let hasConstantMaterializer = 1;
}
-class IREEPyDM_Op<string mnemonic, list<OpTrait> traits = []> :
+class IREEPyDM_Op<string mnemonic, list<Trait> traits = []> :
Op<IREEPyDM_Dialect, mnemonic, traits> {
let verifier = [{ return ::verify(*this); }];
}
-class IREEPyDM_PureOp<string mnemonic, list<OpTrait> traits = []> :
+class IREEPyDM_PureOp<string mnemonic, list<Trait> traits = []> :
Op<IREEPyDM_Dialect, mnemonic, !listconcat(traits, [NoSideEffect])> {
let verifier = [{ return ::verify(*this); }];
}
diff --git a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMOps.td b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMOps.td
index cce9fb0..f6dba20 100644
--- a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMOps.td
+++ b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMOps.td
@@ -159,7 +159,7 @@
return getType().getResult(1);
}
- /// Hook for OpTrait::FunctionLike, called after verifying that the 'type'
+ /// Hook for Trait::FunctionLike, called after verifying that the 'type'
/// attribute is present. This can check for preconditions of the
/// getNumArguments hook not failing.
LogicalResult verifyType();
diff --git a/llvm-external-projects/iree-dialects/lib/Dialect/PyDM/Transforms/RTL/CMakeLists.txt b/llvm-external-projects/iree-dialects/lib/Dialect/PyDM/Transforms/RTL/CMakeLists.txt
index 08392f8..6c120a2 100644
--- a/llvm-external-projects/iree-dialects/lib/Dialect/PyDM/Transforms/RTL/CMakeLists.txt
+++ b/llvm-external-projects/iree-dialects/lib/Dialect/PyDM/Transforms/RTL/CMakeLists.txt
@@ -11,6 +11,7 @@
IREEPyDMDialect
MLIRIR
MLIRParser
+ MLIRStandard
MLIRTransformUtils
)
diff --git a/llvm-external-projects/iree-dialects/lib/Dialect/PyDM/Transforms/RTL/LinkRTLPass.cpp b/llvm-external-projects/iree-dialects/lib/Dialect/PyDM/Transforms/RTL/LinkRTLPass.cpp
index f61214a..565fa69 100644
--- a/llvm-external-projects/iree-dialects/lib/Dialect/PyDM/Transforms/RTL/LinkRTLPass.cpp
+++ b/llvm-external-projects/iree-dialects/lib/Dialect/PyDM/Transforms/RTL/LinkRTLPass.cpp
@@ -53,12 +53,14 @@
// Parse from inline asm.
auto owningOp = parseSourceString(*localSource.asmBlob, context);
if (!owningOp) return failure();
- rtlModule = std::make_shared<OwningModuleRef>(std::move(owningOp));
+ rtlModule = std::make_shared<mlir::OwningOpRef<mlir::ModuleOp>>(
+ std::move(owningOp));
} else if (localSource.asmFilePath) {
// Parse from a file.
auto owningOp = parseSourceFile(*localSource.asmFilePath, context);
if (!owningOp) return failure();
- rtlModule = std::make_shared<OwningModuleRef>(std::move(owningOp));
+ rtlModule = std::make_shared<mlir::OwningOpRef<mlir::ModuleOp>>(
+ std::move(owningOp));
} else {
return emitError(UnknownLoc::get(context))
<< "pass " << getArgument()
@@ -198,7 +200,7 @@
}
// Really, this is the best option for this kind of thing.
- std::shared_ptr<OwningModuleRef> rtlModule;
+ std::shared_ptr<mlir::OwningOpRef<mlir::ModuleOp>> rtlModule;
// A SymbolTable for each sub module.
SmallVector<SymbolTable> importModules;
diff --git a/experimental/sample_web_static/local_server.py b/scripts/local_web_server.py
similarity index 100%
rename from experimental/sample_web_static/local_server.py
rename to scripts/local_web_server.py
diff --git a/third_party/llvm-project b/third_party/llvm-project
index 07ad054..b82a3a8 160000
--- a/third_party/llvm-project
+++ b/third_party/llvm-project
@@ -1 +1 @@
-Subproject commit 07ad054a724dc1fda57a0e4f90358273586e8201
+Subproject commit b82a3a8ef3843a8b98807219db87bc12f38ed704
diff --git a/third_party/mlir-hlo b/third_party/mlir-hlo
index 496a134..6bbe110 160000
--- a/third_party/mlir-hlo
+++ b/third_party/mlir-hlo
@@ -1 +1 @@
-Subproject commit 496a134541f8483e1c01ab488bc416b9780df63a
+Subproject commit 6bbe110606f246b63a49e5e793e8cb6ed2ac1dd2