Merge pull request #2568 from google/benvanik-flatcc-vm

Refactoring bytecode_module to use flatcc instead of flatbuffers C++.
diff --git a/.github/workflows/google_to_main.yml b/.github/workflows/google_to_main.yml
deleted file mode 100644
index dd85178..0000000
--- a/.github/workflows/google_to_main.yml
+++ /dev/null
@@ -1,62 +0,0 @@
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Creates a PR to merge the `google` branch into the `main` branch.
-
-name: google -> main
-
-on:
-  schedule:
-    # Every 6 hours at 45 minutes past the hour (to be a bit behind the TF submodule update)
-    # 00:45, 06:45, 12:45, 18:45 UTC (04:45, 10:45, 16:45, 22:45 PST)
-    - cron: '45 */6 * * *'
-
-jobs:
-  google_to_main:
-    # Don't run this in everyone's forks.
-    if: github.repository == 'google/iree'
-    runs-on: ubuntu-18.04
-    steps:
-      - name: Checking out repository
-        uses: actions/checkout@v2
-        with:
-          ref: "google"
-      # We have to explicitly fetch the main branch as well
-      - name: Fetching Base Branch
-        run: git fetch --no-tags --prune --depth=1 origin main
-      - name: Checking for a diff
-        run: |
-          echo "::set-env name=has_diff::false"
-          git diff main --exit-code || echo "::set-env name=has_diff::true"
-      - name: Calculating PR body
-        if: env.has_diff == 'true'
-        run: |
-          echo "::set-env name=pr_body::$(git log main.. --decorate=no --pretty='format:* %h %<(80,trunc)%s')"
-      - name: Initializing submodules
-        if: env.has_diff == 'true'
-        run: ./scripts/git/submodule_versions.py init
-      - name: Creating Pull Request
-        if: env.has_diff == 'true'
-        uses: peter-evans/create-pull-request@v2
-        with:
-          # Personal token is required to trigger additional automation (e.g. presubmits).
-          token: ${{ secrets.GITHUB_WRITE_ACCESS_TOKEN }}
-          commit-message: "Merge google -> main"
-          title: "Merge google -> main"
-          body: "${{ env.pr_body }}"
-          committer: "Integrate Action <iree-github-actions-bot@google.com>"
-          # TODO(gcmn): Figure out a way to assign this to someone dynamically.
-          reviewers: gmngeoffrey
-          branch: "google-to-main"
-          base: "main"
diff --git a/.github/workflows/publish_docs.yml b/.github/workflows/publish_docs.yml
index 123f5a4..457da97 100644
--- a/.github/workflows/publish_docs.yml
+++ b/.github/workflows/publish_docs.yml
@@ -33,9 +33,6 @@
         uses: actions/checkout@v2
         with:
           token: ${{ secrets.GITHUB_WRITE_ACCESS_TOKEN }}
-      - name: Fetching gh-pages branch
-        run: |
-          git fetch origin gh-pages
       - name: Initializing submodules
         run: ./scripts/git/submodule_versions.py init
       - name: Installing Ninja build
@@ -45,6 +42,9 @@
           ./build_tools/cmake/build_docs.sh
           # Patch the MarkDown files with front matter for rendering
           ./scripts/prepare_doc_publication.py ${IREE_DOC_BUILD_DIR}/doc
+      - name: Fetching gh-pages branch
+        run: |
+          git fetch origin gh-pages
       - name: Updating gh-pages branch
         run: |
           git checkout -f gh-pages
diff --git a/.github/workflows/update_tf.yml b/.github/workflows/update_tf.yml
index 86c55d9..b116f99 100644
--- a/.github/workflows/update_tf.yml
+++ b/.github/workflows/update_tf.yml
@@ -54,6 +54,4 @@
 
             Automated submodule bump from .github/workflows/update_tf.yml
           committer: "Submodule Update Action <iree-github-actions-bot@google.com>"
-          # TODO(gcmn): Figure out a way to assign this to someone dynamically.
-          reviewers: gmngeoffrey
           branch: "auto_submodule_update"
diff --git a/.style.yapf b/.style.yapf
new file mode 100644
index 0000000..9ef1dc1
--- /dev/null
+++ b/.style.yapf
@@ -0,0 +1,4 @@
+[style]
+  based_on_style = google
+  column_limit = 80
+  indent_width = 2
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 359d641..1a2377d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -93,6 +93,10 @@
 
 if( IREE_HAL_DRIVERS_TO_BUILD STREQUAL "all" )
   set( IREE_HAL_DRIVERS_TO_BUILD ${IREE_ALL_HAL_DRIVERS} )
+  # For cross compilation towards Android, we don't want LLVM JIT HAL driver.
+  if(ANDROID)
+    list(REMOVE_ITEM IREE_HAL_DRIVERS_TO_BUILD LLVM)
+  endif()
 endif()
 message(STATUS "Building HAL drivers ${IREE_HAL_DRIVERS_TO_BUILD}")
 
@@ -112,8 +116,8 @@
 # List of all target backends to be built by default:
 set(IREE_ALL_TARGET_BACKENDS
   # TODO(scotttodd): LLVMAOT
-  LLVMIR
-  Vulkan_SPIRV
+  LLVM-IR
+  Vulkan-SPIRV
   VMLA
 )
 
@@ -202,7 +206,6 @@
 include(iree_cc_embed_data)
 include(iree_bytecode_module)
 include(iree_multipy)
-include(iree_py_test)
 include(iree_lit_test)
 include(iree_add_all_subdirs)
 include(iree_check_test)
@@ -287,6 +290,8 @@
     message(STATUS "Adding bundled LLVM source dependency")
     add_iree_mlir_src_dep("third_party/llvm-project")
   elseif(${IREE_MLIR_DEP_MODE} STREQUAL "INSTALLED")
+    # Deps of installed MLIR/LLVM packages.
+    find_package(ZLIB)  # See: https://reviews.llvm.org/D79219
     message(STATUS "Looking for installed MLIR/LLVM packages (configure with MLIR_DIR variable)")
     find_package(MLIR REQUIRED CONFIG)
     message(STATUS "Using MLIRConfig.cmake in: ${MLIR_DIR}")
@@ -325,7 +330,8 @@
   find_package(PythonInterp 3 REQUIRED)
 endif()
 if(${IREE_BUILD_PYTHON_BINDINGS})
-  find_package(PythonLibs 3 REQUIRED)
+  # Note: Optional because python libs can be manually specified.
+  find_package(PythonLibs 3)
 endif()
 
 list(APPEND CMAKE_MODULE_PATH
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index a906067..0f7897d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -58,7 +58,7 @@
 ## Peculiarities
 
 Our documentation on
-[repository management](https://github.com/google/iree/blob/main/docs/repository_management.md)
+[repository management](https://github.com/google/iree/blob/main/docs/developing_iree/repository_management.md)
 has more information on some of the oddities in our repository setup and
 workflows. For the most part, these should be transparent to normal developer
 workflows.
diff --git a/README.md b/README.md
index e6de0e2..129547c 100644
--- a/README.md
+++ b/README.md
@@ -37,8 +37,8 @@
 working on enabling macOS support. For deployment, IREE aims to additionally
 cover Android and iOS.
 
-Please see the [Getting Started](https://google.github.io/iree/GetStarted) pages
-on IREE's [documentation hub](https://google.github.io/iree) to configure,
+Please see the [Getting Started](https://google.github.io/iree/get-started)
+pages on IREE's [documentation hub](https://google.github.io/iree) to configure,
 compile, and run IREE in your favorite development environment!
 
 ## Documentation and Talks
@@ -68,7 +68,7 @@
 
 The architecture of IREE is best illustrated by the following picture:
 
-![IREE Architecture](./docs/IREE-Architecture.svg)
+![IREE Architecture](./docs/iree_architecture.svg)
 
 Being compilation-based means IREE does not have a traditional runtime that
 dispatches "ops" to their fat kernel implementations. What IREE provides is a
@@ -100,8 +100,8 @@
 ## Roadmap and Milestones
 
 IREE is still at its early stage; we have lots of exciting future plans. Please
-check out the [long-term design roadmap](./docs/roadmap_design.md) and
-[short-term focus areas](./docs/roadmap.md).
+check out the [long-term design roadmap](./docs/design_roadmap.md) and
+[short-term focus areas](./docs/milestones.md).
 
 We use [GitHub Projects](https://github.com/google/iree/projects) to track
 various IREE components and
diff --git a/SUBMODULE_VERSIONS b/SUBMODULE_VERSIONS
index ba8dd45..d74192b 100644
--- a/SUBMODULE_VERSIONS
+++ b/SUBMODULE_VERSIONS
@@ -4,15 +4,15 @@
 a5d9d0f7d368054fd1691aedf1db4116efcc233e third_party/flatbuffers
 4fb0ff7069bd88ee85902f4d0bb62794e5f6d021 third_party/flatcc
 f2fb48c3b3d79a75a88a99fba6576b25d42ec528 third_party/googletest
-7ca9b589c45302feb28c0b3b0e80088c0901bb40 third_party/llvm-project
+eed333149d178b69fdaf39b9419b7ca032520182 third_party/llvm-project
 17b12a4481daa150e2d1ea3ada086b551b856707 third_party/marl
-67f3ccebee84f3488b46a8d3ac005178c52ff264 third_party/mlir-emitc
+80885f899e12d55a45561ef758eea47bb340dbf1 third_party/mlir-emitc
 80d452484c5409444b0ec19383faa84bb7a4d351 third_party/pybind11
 9f53ba413e6fc879236dcaa3e008915973d67a4f third_party/ruy
-b73f111094da3e380a1774b56b15f16c90ae8e23 third_party/sdl2
+a1390ed39ec77ecfb574bc6fcd5bfc5e3adbdea9 third_party/sdl2
 f8bf11a0253a32375c32cad92c841237b96696c0 third_party/spirv_headers
 57eb48aed36160c4876bc8310d9ca84d42ee9e2a third_party/swiftshader
-e36aca0132fbcde0bc820d56185e3078f97a879d third_party/tensorflow
+e29e1f4e574caab071e93cfb91fa9ee0944cd87c third_party/tensorflow
 864d86e8b6d21449474db5e9313dbff90aa9c24f third_party/tracy
 9bd3f561bcee3f01d22912de10bb07ce4e23d378 third_party/vulkan_headers
 909f36b714c9239ee0b112a321220213a474ba53 third_party/vulkan_memory_allocator
diff --git a/WORKSPACE b/WORKSPACE
index 702a087..cf098ce 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -63,7 +63,7 @@
 rbe_autoconfig(
     name = "rbe_default",
     base_container_digest = "sha256:1a8ed713f40267bb51fe17de012fa631a20c52df818ccb317aaed2ee068dfc61",
-    digest = "sha256:b59d8cc422b03524394d4d05e443bf38d4fe96fab06197b34174de01572e8161",
+    digest = "sha256:bc2d61ad05453928e67b434ae019e7d050dda46c091270f2b81b2f09da2276ce",
     registry = "gcr.io",
     repository = "iree-oss/rbe-toolchain",
     use_checked_in_confs = "Force",
diff --git a/bindings/python/build_tools/cmake/iree_py_test.cmake b/bindings/python/build_tools/cmake/iree_py_test.cmake
deleted file mode 100644
index 0fbb6f0..0000000
--- a/bindings/python/build_tools/cmake/iree_py_test.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-include(CMakeParseArguments)
-
-# iree_py_test()
-#
-# CMake function to imitate Bazel's iree_py_test rule.
-#
-# Parameters:
-# NAME: name of test
-# SRCS: List of source file
-# DEPS: List of deps the test requires
-# LABELS: Additional labels to apply to the test. The package path is added
-#     automatically.
-
-function(iree_py_test)
-  if(NOT IREE_BUILD_TESTS)
-    return()
-  endif()
-
-  cmake_parse_arguments(
-    _RULE
-    ""
-    "NAME"
-    "SRCS;DEPS;LABELS"
-    ${ARGN}
-  )
-
-  iree_package_name(_PACKAGE_NAME)
-  set(_NAME "${_PACKAGE_NAME}_${_RULE_NAME}")
-
-  iree_package_ns(_PACKAGE_NS)
-  string(REPLACE "::" "/" _PACKAGE_PATH ${_PACKAGE_NS})
-  set(_NAME_PATH "${_PACKAGE_PATH}:${_RULE_NAME}")
-
-  add_test(
-    NAME ${_NAME_PATH}
-    COMMAND ${CMAKE_SOURCE_DIR}/build_tools/cmake/run_test.sh ${PYTHON_EXECUTABLE} "${CMAKE_CURRENT_SOURCE_DIR}/${_RULE_SRCS}"
-    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
-  )
-
-  list(APPEND _RULE_LABELS "${_PACKAGE_PATH}")
-  set_property(TEST ${_NAME_PATH} PROPERTY LABELS "${_RULE_LABELS}")
-  set_property(TEST ${_NAME_PATH} PROPERTY ENVIRONMENT "PYTHONPATH=${CMAKE_BINARY_DIR}/bindings/python:$ENV{PYTHONPATH};TEST_TMPDIR=${_NAME}_test_tmpdir")
-  # TODO(marbre): Find out how to add deps to tests.
-  #               Similar to _RULE_DATA in iree_lit_test().
-
-endfunction()
diff --git a/bindings/python/build_tools/python/generate_build.py b/bindings/python/build_tools/python/generate_build.py
index 6d8cff4..6705cfd 100644
--- a/bindings/python/build_tools/python/generate_build.py
+++ b/bindings/python/build_tools/python/generate_build.py
@@ -64,7 +64,7 @@
 
 cc_library(
     name = "python_headers",
-    hdrs = glob(["include/*.h"]),
+    hdrs = glob(["include/**/*.h"]),
     srcs = [{extra_srcs}],
     includes = ["include"],
     linkopts = [],
diff --git a/bindings/python/pyiree/compiler/compiler.cc b/bindings/python/pyiree/compiler/compiler.cc
index cfd8289..bb356f9 100644
--- a/bindings/python/pyiree/compiler/compiler.cc
+++ b/bindings/python/pyiree/compiler/compiler.cc
@@ -337,7 +337,7 @@
   mlir::applyPassManagerCLOptions(pass_manager);
   auto crash_reproducer_path = context_->crash_reproducer_path();
   if (crash_reproducer_path) {
-    pass_manager.enableCrashReproducerGeneration(*crash_reproducer_path);
+    pass_manager.enableCrashReproducerGeneration(*crash_reproducer_path, true);
   }
 
   mlir::iree_compiler::IREE::HAL::TargetOptions executable_options;
diff --git a/bindings/python/pyiree/rt/system_api.py b/bindings/python/pyiree/rt/system_api.py
index 555ce32..aaea01f 100644
--- a/bindings/python/pyiree/rt/system_api.py
+++ b/bindings/python/pyiree/rt/system_api.py
@@ -81,14 +81,12 @@
       device = driver.create_default_device()
     except Exception as ex:
       print(
-          "Could not create default driver device %s: %r" % (
-              driver_name, ex),
+          "Could not create default driver device %s: %r" % (driver_name, ex),
           file=sys.stderr)
       driver_exceptions[driver_name] = ex
       continue
 
-    print(
-        "Created IREE driver %s: %r" % (driver_name, driver), file=sys.stderr)
+    print("Created IREE driver %s: %r" % (driver_name, driver), file=sys.stderr)
     return driver
 
   # All failed.
diff --git a/build_tools/bazel/build_tensorflow.sh b/build_tools/bazel/build_tensorflow.sh
index 07b0eb8..be82227 100755
--- a/build_tools/bazel/build_tensorflow.sh
+++ b/build_tools/bazel/build_tensorflow.sh
@@ -43,6 +43,11 @@
   --test_env=IREE_LLVMJIT_DISABLE=$IREE_LLVMJIT_DISABLE
   --test_env=IREE_VULKAN_DISABLE=$IREE_VULKAN_DISABLE
 )
+# Pass in VK_ICD_FILENAMES if it exists so that the Vulkan loader can find the
+# Vulkan implementation.
+if  [[ -v VK_ICD_FILENAMES ]]; then
+  test_env_args+=(--test_env=VK_ICD_FILENAMES=$VK_ICD_FILENAMES)
+fi
 
 declare -a default_build_tag_filters=("-nokokoro")
 declare -a default_test_tag_filters=("-nokokoro")
@@ -87,8 +92,6 @@
         --config=generic_clang \
         --build_tag_filters="${BUILD_TAG_FILTERS?}" \
         --test_tag_filters="${TEST_TAG_FILTERS?}" \
+        --config=rs \
         --test_output=errors \
         --keep_going
-        # TODO: Enable result store once the Kokoro VMs used for this test have the
-        # appropriate auth.
-        # --config=rs
diff --git a/build_tools/bazel/iree_flatcc.bzl b/build_tools/bazel/iree_flatcc.bzl
index c355563..e091af6 100644
--- a/build_tools/bazel/iree_flatcc.bzl
+++ b/build_tools/bazel/iree_flatcc.bzl
@@ -24,19 +24,19 @@
     flatcc_rt = "@com_github_dvidelabs_flatcc//:runtime"
 
     flags = [
-      "-o$(RULEDIR)",
+        "-o$(RULEDIR)",
     ] + flatcc_args
 
     out_stem = "%s" % (srcs[0].replace(".fbs", ""))
 
     outs = []
     for arg in flags:
-      if arg == "--reader":
-        outs += ["%s_reader.h" % (out_stem)]
-      if arg == "--builder":
-        outs += ["%s_builder.h" % (out_stem)]
-      if arg == "--verifier":
-        outs += ["%s_verifier.h" % (out_stem)]
+        if arg == "--reader":
+            outs += ["%s_reader.h" % (out_stem)]
+        if arg == "--builder":
+            outs += ["%s_builder.h" % (out_stem)]
+        if arg == "--verifier":
+            outs += ["%s_verifier.h" % (out_stem)]
 
     native.genrule(
         name = name + "_gen",
@@ -50,7 +50,7 @@
         name = name,
         hdrs = outs,
         deps = [
-          flatcc_rt,
+            flatcc_rt,
         ],
         testonly = testonly,
         **kwargs
diff --git a/build_tools/bazel/third_party_import/llvm-project/overlay/llvm/BUILD.bazel b/build_tools/bazel/third_party_import/llvm-project/overlay/llvm/BUILD.bazel
index bade7ab..befc20c 100644
--- a/build_tools/bazel/third_party_import/llvm-project/overlay/llvm/BUILD.bazel
+++ b/build_tools/bazel/third_party_import/llvm-project/overlay/llvm/BUILD.bazel
@@ -155,10 +155,10 @@
     name = "InstCombineTableGen",
     tbl_outs = [(
         "-gen-searchable-tables",
-        "lib/Transforms/InstCombine/InstCombineTables.inc",
+        "lib/Target/AMDGPU/InstCombineTables.inc",
     )],
     tblgen = ":llvm-tblgen",
-    td_file = "lib/Transforms/InstCombine/InstCombineTables.td",
+    td_file = "lib/Target/AMDGPU/InstCombineTables.td",
     td_srcs = glob([
         "include/llvm/CodeGen/*.td",
         "include/llvm/IR/Intrinsics*.td",
@@ -721,6 +721,7 @@
             "lib/Analysis/*.h",
         ],
         exclude = [
+            "lib/Analysis/DevelopmentModeInlineAdvisor.cpp",
             "lib/Analysis/MLInlineAdvisor.cpp",
             "lib/Analysis/ReleaseModeModelRunner.cpp",
             "lib/Analysis/TFUtils.cpp",
@@ -3187,6 +3188,7 @@
     ]),
     copts = llvm_copts,
     deps = [
+        ":BinaryFormat",
         ":DebugInfoCodeView",
         ":MC",
         ":Object",
diff --git a/build_tools/bazel/third_party_import/llvm-project/overlay/mlir/BUILD.bazel b/build_tools/bazel/third_party_import/llvm-project/overlay/mlir/BUILD.bazel
index ec0574f..7e42a4b 100644
--- a/build_tools/bazel/third_party_import/llvm-project/overlay/mlir/BUILD.bazel
+++ b/build_tools/bazel/third_party_import/llvm-project/overlay/mlir/BUILD.bazel
@@ -387,7 +387,7 @@
         "include/mlir/Interfaces/CallInterfaces.td",
         "include/mlir/Interfaces/ControlFlowInterfaces.td",
         "include/mlir/Interfaces/SideEffectInterfaces.td",
-        "include/mlir/Interfaces/VectorUnrollInterface.td",
+        "include/mlir/Interfaces/VectorInterfaces.td",
         "include/mlir/Interfaces/ViewLikeInterface.td",
         ":OpBaseTdFiles",
     ],
@@ -500,6 +500,7 @@
     deps = [
         ":Affine",
         ":IR",
+        ":Support",
         "@llvm-project//llvm:Support",
     ],
 )
@@ -647,13 +648,13 @@
 )
 
 cc_library(
-    name = "VectorUnrollInterface",
-    srcs = ["lib/Interfaces/VectorUnrollInterface.cpp"],
-    hdrs = ["include/mlir/Interfaces/VectorUnrollInterface.h"],
+    name = "VectorInterfaces",
+    srcs = ["lib/Interfaces/VectorInterfaces.cpp"],
+    hdrs = ["include/mlir/Interfaces/VectorInterfaces.h"],
     includes = ["include"],
     deps = [
         ":IR",
-        ":VectorUnrollInterfaceIncGen",
+        ":VectorInterfacesIncGen",
     ],
 )
 
@@ -739,6 +740,7 @@
         ":MLIRShapeCanonicalizationIncGen",
         ":ShapeOpsIncGen",
         ":SideEffectInterfaces",
+        ":StandardOps",
         ":Support",
         "@llvm-project//llvm:Support",
     ],
@@ -757,32 +759,12 @@
         ":Pass",
         ":SCFDialect",
         ":Shape",
-        ":ShapeToStandardPatternsIncGen",
         ":StandardOps",
         ":Support",
         ":Transforms",
     ],
 )
 
-gentbl(
-    name = "ShapeToStandardPatternsIncGen",
-    strip_include_prefix = "include/mlir/Conversion/ShapeToStandard",
-    tbl_outs = [
-        (
-            "-gen-rewriters",
-            "include/mlir/Conversion/ShapeToStandard/ShapeToStandardPatterns.inc",
-        ),
-    ],
-    tblgen = ":mlir-tblgen",
-    td_file = "lib/Conversion/ShapeToStandard/ShapeToStandardPatterns.td",
-    td_srcs = [
-        ":StdOpsTdFiles",
-        "include/mlir/Dialect/Shape/IR/ShapeBase.td",
-        "include/mlir/Dialect/Shape/IR/ShapeOps.td",
-        "include/mlir/Interfaces/InferTypeOpInterface.td",
-    ],
-)
-
 cc_library(
     name = "ShapeToSCF",
     srcs = glob([
@@ -855,7 +837,7 @@
         ":SideEffectInterfaces",
         ":StandardOpsIncGen",
         ":Support",
-        ":VectorUnrollInterface",
+        ":VectorInterfaces",
         ":ViewLikeInterface",
         "@llvm-project//llvm:Support",
     ],
@@ -918,9 +900,9 @@
         ":SideEffectInterfaces",
         ":StandardOps",
         ":Support",
+        ":VectorInterfaces",
         ":VectorOpsIncGen",
         ":VectorTransformPatternsIncGen",
-        ":VectorUnrollInterface",
         "@llvm-project//llvm:Support",
     ],
 )
@@ -2127,20 +2109,20 @@
 )
 
 gentbl(
-    name = "VectorUnrollInterfaceIncGen",
+    name = "VectorInterfacesIncGen",
     strip_include_prefix = "include",
     tbl_outs = [
         (
             "-gen-op-interface-decls",
-            "include/mlir/Interfaces/VectorUnrollInterface.h.inc",
+            "include/mlir/Interfaces/VectorInterfaces.h.inc",
         ),
         (
             "-gen-op-interface-defs",
-            "include/mlir/Interfaces/VectorUnrollInterface.cpp.inc",
+            "include/mlir/Interfaces/VectorInterfaces.cpp.inc",
         ),
     ],
     tblgen = ":mlir-tblgen",
-    td_file = "include/mlir/Interfaces/VectorUnrollInterface.td",
+    td_file = "include/mlir/Interfaces/VectorInterfaces.td",
     td_srcs = [
         ":OpBaseTdFiles",
     ],
@@ -3271,6 +3253,7 @@
         ":QuantPassIncGen",
         ":SideEffectInterfaces",
         ":StandardOps",
+        ":TransformUtils",
         "@llvm-project//llvm:Support",
     ],
 )
@@ -3586,7 +3569,7 @@
     name = "VectorOpsTdFiles",
     srcs = [
         "include/mlir/Dialect/Vector/VectorOps.td",
-        "include/mlir/Interfaces/VectorUnrollInterface.td",
+        "include/mlir/Interfaces/VectorInterfaces.td",
         ":AffineOpsTdFiles",
         ":OpBaseTdFiles",
     ],
@@ -3716,7 +3699,7 @@
         "include/mlir/Interfaces/ControlFlowInterfaces.h",
         "include/mlir/Interfaces/ControlFlowInterfaces.td",
         "include/mlir/Interfaces/SideEffectInterfaces.td",
-        "include/mlir/Interfaces/VectorUnrollInterface.td",
+        "include/mlir/Interfaces/VectorInterfaces.td",
         "include/mlir/Interfaces/ViewLikeInterface.td",
         "include/mlir/Dialect/LLVMIR/LLVMOpBase.td",
         "include/mlir/Dialect/StandardOps/IR/Ops.td",
diff --git a/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py b/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py
index d2df056..bc40c97 100644
--- a/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py
+++ b/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py
@@ -237,8 +237,8 @@
   def _convert_flatcc_args_block(self, flatcc_args):
     if not flatcc_args:
       return ""
-    flatcc_args = "\n".join([f'    "{flatcc_arg}"'
-                             for flatcc_arg in flatcc_args])
+    flatcc_args = "\n".join(
+        [f'    "{flatcc_arg}"' for flatcc_arg in flatcc_args])
     return f"  FLATCC_ARGS\n{flatcc_args}\n"
 
   def _convert_unimplemented_function(self, function, details=""):
diff --git a/build_tools/cmake/build_docs.sh b/build_tools/cmake/build_docs.sh
index d4a3c84..191ed17 100755
--- a/build_tools/cmake/build_docs.sh
+++ b/build_tools/cmake/build_docs.sh
@@ -49,34 +49,12 @@
 
 cd ${ROOT_DIR?}
 
+cp README.md ${BUILD_DIR}/doc/index.md
+cp -rf docs/* ${BUILD_DIR}/doc/
+
 # Update op_coverage.md
 scripts/update_op_coverage.py ${BUILD_DIR}
 
 # Update e2e_coverage.md
 PYTHON_BIN=`which python3` scripts/update_e2e_coverage.py ${BUILD_DIR}
 
-# Copy a curated list of docs to publish. This is expected to cover all docs
-# under docs/ after they are refreshed.
-
-cp README.md ${BUILD_DIR}/doc/index.md
-cp docs/IREE-Architecture.svg ${BUILD_DIR}/doc/
-
-cp docs/roadmap.md ${BUILD_DIR}/doc/
-cp docs/roadmap_design.md ${BUILD_DIR}/doc/
-cp docs/developer_overview.md ${BUILD_DIR}/doc/
-cp docs/testing_guide.md ${BUILD_DIR}/doc/
-cp docs/iree_community.md ${BUILD_DIR}/doc/
-
-mkdir -p ${BUILD_DIR}/doc/GetStarted/
-cp docs/GetStarted/getting_started_windows_bazel.md ${BUILD_DIR}/doc/GetStarted/
-cp docs/GetStarted/getting_started_windows_cmake.md ${BUILD_DIR}/doc/GetStarted/
-cp docs/GetStarted/getting_started_windows_vulkan.md ${BUILD_DIR}/doc/GetStarted/
-cp docs/GetStarted/getting_started_linux_bazel.md ${BUILD_DIR}/doc/GetStarted/
-cp docs/GetStarted/getting_started_linux_cmake.md ${BUILD_DIR}/doc/GetStarted/
-cp docs/GetStarted/getting_started_linux_vulkan.md ${BUILD_DIR}/doc/GetStarted/
-cp docs/GetStarted/getting_started_macos_bazel.md ${BUILD_DIR}/doc/GetStarted/
-cp docs/GetStarted/getting_started_macos_cmake.md ${BUILD_DIR}/doc/GetStarted/
-cp docs/GetStarted/getting_started_android_cmake.md ${BUILD_DIR}/doc/GetStarted/
-cp docs/GetStarted/getting_started_python.md ${BUILD_DIR}/doc/GetStarted/
-cp docs/GetStarted/generic_vulkan_env_setup.md ${BUILD_DIR}/doc/GetStarted/
-cp docs/GetStarted/cmake_options_and_variables.md ${BUILD_DIR}/doc/GetStarted/
diff --git a/build_tools/cmake/iree_check_test.cmake b/build_tools/cmake/iree_check_test.cmake
index aa794bd..93fd934 100644
--- a/build_tools/cmake/iree_check_test.cmake
+++ b/build_tools/cmake/iree_check_test.cmake
@@ -173,6 +173,17 @@
     ${ARGN}
   )
 
+
+  string(TOUPPER ${_RULE_DRIVER} _UPPERCASE_DRIVER)
+  if(NOT IREE_HAL_DRIVER_${_UPPERCASE_DRIVER})
+    return()
+  endif()
+
+  string(TOUPPER ${_RULE_TARGET_BACKEND} _UPPERCASE_TARGET_BACKEND)
+  if(NOT IREE_TARGET_BACKEND_${_UPPERCASE_TARGET_BACKEND})
+    return()
+  endif()
+
   foreach(_SRC IN LISTS _RULE_SRCS)
     set(_TEST_NAME "${_RULE_NAME}_${_SRC}")
     iree_check_test(
diff --git a/build_tools/cmake/iree_copts.cmake b/build_tools/cmake/iree_copts.cmake
index c39db21..7563c1f 100644
--- a/build_tools/cmake/iree_copts.cmake
+++ b/build_tools/cmake/iree_copts.cmake
@@ -250,3 +250,15 @@
   ${PROJECT_BINARY_DIR}/build_tools/third_party/tensorflow
   ${PROJECT_BINARY_DIR}/build_tools/third_party/tensorflow/tensorflow/compiler/mlir/hlo/include/
 )
+
+#-------------------------------------------------------------------------------
+# Third party: mlir-emitc
+#-------------------------------------------------------------------------------
+
+if(IREE_ENABLE_EMITC)
+  list(APPEND IREE_COMMON_INCLUDE_DIRS
+    ${PROJECT_SOURCE_DIR}/third_party/mlir-emitc/include
+    ${PROJECT_BINARY_DIR}/third_party/mlir-emitc/include
+  )
+  add_definitions(-DIREE_HAVE_EMITC_DIALECT)
+endif()
diff --git a/build_tools/cmake/iree_multipy.cmake b/build_tools/cmake/iree_multipy.cmake
index a53995a..a7fd1f1 100644
--- a/build_tools/cmake/iree_multipy.cmake
+++ b/build_tools/cmake/iree_multipy.cmake
@@ -22,14 +22,17 @@
   # Configure the defaults.
   # Note that this is using the pybind11 configuration vars, which creates
   # a fragile dependency. It would be better to derive these locally.
-  set(IREE_MULTIPY_DEFAULT_INCLUDE_DIRS "${PYTHON_INCLUDE_DIRS}" CACHE INTERNAL "Python include dirs" )
-  set(IREE_MULTIPY_DEFAULT_LIBRARIES "${PYTHON_LIBRARIES}" CACHE INTERNAL "Python libraries")
-  set(IREE_MULTIPY_DEFAULT_PREFIX "${PYTHON_MODULE_PREFIX}" CACHE INTERNAL "Python module prefix")
-  set(IREE_MULTIPY_DEFAULT_SUFFIX "${PYTHON_MODULE_SUFFIX}" CACHE INTERNAL "Python module suffix")
-  set(IREE_MULTIPY_DEFAULT_EXTENSION "${PYTHON_MODULE_EXTENSION}" CACHE INTERNAL "Python module extension")
+  if(PYTHONLIBS_FOUND)
+    set(IREE_MULTIPY_DEFAULT_EXECUTABLE "${PYTHON_EXECUTABLE}" CACHE INTERNAL "Python executable" )
+    set(IREE_MULTIPY_DEFAULT_INCLUDE_DIRS "${PYTHON_INCLUDE_DIRS}" CACHE INTERNAL "Python include dirs" )
+    set(IREE_MULTIPY_DEFAULT_LIBRARIES "${PYTHON_LIBRARIES}" CACHE INTERNAL "Python libraries")
+    set(IREE_MULTIPY_DEFAULT_PREFIX "${PYTHON_MODULE_PREFIX}" CACHE INTERNAL "Python module prefix")
+    set(IREE_MULTIPY_DEFAULT_SUFFIX "${PYTHON_MODULE_SUFFIX}" CACHE INTERNAL "Python module suffix")
+    set(IREE_MULTIPY_DEFAULT_EXTENSION "${PYTHON_MODULE_EXTENSION}" CACHE INTERNAL "Python module extension")
+  endif()
 
   if(IREE_MULTIPY_VERSIONS)
-    set(IREE_MULTIPY_VERSIONS_EFFECTIVE "${IREE_MULTIPY_VERSIONS}")
+    set(IREE_MULTIPY_VERSIONS_EFFECTIVE "${IREE_MULTIPY_VERSIONS}" CACHE INTERNAL "Python extension versions")
   else()
     message(STATUS "Multi-python extension versions not found: using defaults")
     set(IREE_MULTIPY_VERSIONS_EFFECTIVE "DEFAULT" CACHE INTERNAL "Python extension versions")
@@ -39,18 +42,22 @@
   message(STATUS "Multipy extension versions: ${IREE_MULTIPY_VERSIONS_EFFECTIVE}")
   foreach(V ${IREE_MULTIPY_VERSIONS_EFFECTIVE})
     message(STATUS "  - Multipy version ${V}")
+    message(STATUS "    : EXECUTABLE = ${IREE_MULTIPY_${V}_EXECUTABLE}")
     message(STATUS "    : INCLUDE_DIRS = ${IREE_MULTIPY_${V}_INCLUDE_DIRS}")
     message(STATUS "    : LIBRARIES = ${IREE_MULTIPY_${V}_LIBRARIES}")
     message(STATUS "    : PREFIX = ${IREE_MULTIPY_${V}_PREFIX}")
     message(STATUS "    : SUFFIX = ${IREE_MULTIPY_${V}_SUFFIX}")
     message(STATUS "    : EXTENSION = ${IREE_MULTIPY_${V}_EXTENSION}")
 
-    # Only INCLUDE_DIRS and EXTENSION are needed for all configs.
+    # Check for required settings.
     if(NOT IREE_MULTIPY_${V}_INCLUDE_DIRS)
-      message(FATAL "MULTIPY config ${V}: No IREE_MULTIPY_{VER}_INCLUDE_DIRS var")
+      message(FATAL " MULTIPY version ${V}: No IREE_MULTIPY_${VER}_EXECUTABLE var")
+    endif()
+    if(NOT IREE_MULTIPY_${V}_INCLUDE_DIRS)
+      message(FATAL " MULTIPY version ${V}: No IREE_MULTIPY_${VER}_INCLUDE_DIRS var")
     endif()
     if(NOT IREE_MULTIPY_${V}_EXTENSION)
-      message(FATAL "MULTIPY config ${V}: No IREE_MULTIPY_{VER}_EXTENSION var")
+      message(FATAL " MULTIPY version ${V}: No IREE_MULTIPY_${VER}_EXTENSION var")
     endif()
   endforeach()
 endfunction()
@@ -100,7 +107,18 @@
 
   foreach(V ${IREE_MULTIPY_VERSIONS_EFFECTIVE})
     set(VER_NAME "${_NAME}__${V}")
-    add_library(${VER_NAME} SHARED ${ARG_SRCS})
+
+    # If configured to link against libraries, build in SHARED mode (which
+    # disallows undefined symbols). Otherwise, build in MODULE mode, which
+    # does not enforce that. This should naturally do the right thing on
+    # each platform based on whether configured with a list of libraries to
+    # link or not.
+    set(LIBRARY_TYPE MODULE)
+    if(IREE_MULTIPY_${V}_LIBRARIES)
+      set(LIBRARY_TYPE SHARED)
+    endif()
+
+    add_library(${VER_NAME} ${LIBRARY_TYPE} ${ARG_SRCS})
     add_dependencies(${_NAME} ${VER_NAME})
     set_target_properties(
       ${VER_NAME} PROPERTIES
@@ -132,10 +150,10 @@
     # Track target and deps, use in iree_complete_py_extension_link_options() later.
     # See iree_complete_py_extension_link_options() in iree_py_extension.cmake
     # TODO: Move that implementation here.
-    list(TRANSFORM ARG_PYEXT_DEPS APPEND "__${V}")
+    set(TRANSFORMED_PYEXT_DEPS "${ARG_PYEXT_DEPS}")
+    list(TRANSFORM TRANSFORMED_PYEXT_DEPS APPEND "__${V}")
     set_property(GLOBAL APPEND PROPERTY _IREE_PY_EXTENSION_NAMES "${VER_NAME}")
-    set_property(TARGET ${VER_NAME} PROPERTY DIRECT_DEPS ${ARG_DEPS} ${ARG_PYEXT_DEPS})
-
+    set_property(TARGET ${VER_NAME} PROPERTY DIRECT_DEPS ${ARG_DEPS} ${TRANSFORMED_PYEXT_DEPS})
     _alias_iree_pyext_library("${ARG_NAME}" "${V}" ${VER_NAME})
   endforeach()
 endfunction()
@@ -167,11 +185,12 @@
         "${IREE_MULTIPY_${V}_INCLUDE_DIRS}"
         "$<BUILD_INTERFACE:${IREE_COMMON_INCLUDE_DIRS}>"
     )
-    list(TRANSFORM ARG_PYEXT_DEPS APPEND "__${V}")
+    set(TRANSFORMED_PYEXT_DEPS "${ARG_PYEXT_DEPS}")
+    list(TRANSFORM TRANSFORMED_PYEXT_DEPS APPEND "__${V}")
     target_link_libraries(${VER_NAME}
       PUBLIC
         ${ARG_DEPS}
-        ${ARG_PYEXT_DEPS}
+        ${TRANSFORMED_PYEXT_DEPS}
       PRIVATE
         ${IREE_DEFAULT_LINKOPTS}
     )
@@ -248,6 +267,55 @@
     ${name} PROPERTIES CXX_VISIBILITY_PRESET "hidden")
 endfunction()
 
+# iree_py_test()
+#
+# CMake function to imitate Bazel's iree_py_test rule.
+#
+# Parameters:
+# NAME: name of test
+# SRCS: List of source files
+# DEPS: List of deps the test requires
+# LABELS: Additional labels to apply to the test. The package path is added
+#     automatically.
+
+function(iree_py_test)
+  if(NOT IREE_BUILD_TESTS)
+    return()
+  endif()
+
+  cmake_parse_arguments(
+    _RULE
+    ""
+    "NAME"
+    "SRCS;DEPS;LABELS"
+    ${ARGN}
+  )
+
+  iree_package_name(_PACKAGE_NAME)
+  set(_NAME "${_PACKAGE_NAME}_${_RULE_NAME}")
+
+  iree_package_ns(_PACKAGE_NS)
+  string(REPLACE "::" "/" _PACKAGE_PATH ${_PACKAGE_NS})
+  set(_NAME_PATH "${_PACKAGE_PATH}:${_RULE_NAME}")
+  list(APPEND _RULE_LABELS "${_PACKAGE_PATH}")
+
+  foreach(V ${IREE_MULTIPY_VERSIONS_EFFECTIVE})
+    set(VER_NAME "${_NAME_PATH}__${V}")
+    add_test(
+      NAME ${VER_NAME}
+      COMMAND
+        "${CMAKE_SOURCE_DIR}/build_tools/cmake/run_test.${IREE_HOST_SCRIPT_EXT}"
+        "${IREE_MULTIPY_${V}_EXECUTABLE}"
+        "${CMAKE_CURRENT_SOURCE_DIR}/${_RULE_SRCS}"
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    )
+
+    set_property(TEST ${VER_NAME} PROPERTY LABELS "${_RULE_LABELS}")
+    set_property(TEST ${VER_NAME} PROPERTY ENVIRONMENT "PYTHONPATH=${CMAKE_BINARY_DIR}/bindings/python:$ENV{PYTHONPATH};TEST_TMPDIR=${_NAME}_${V}_test_tmpdir")
+    # TODO(marbre): Find out how to add deps to tests.
+    #               Similar to _RULE_DATA in iree_lit_test().
+  endforeach()
+endfunction()
 
 ###############################################################################
 # Always-link/transitive dependency management
@@ -378,7 +446,6 @@
           ${_STANDARD_DEPS}
         PRIVATE
           ${_RULE_LINKOPTS}
-          ${PYTHON_LIBRARY}
       )
     else()
       target_link_libraries(${_NAME}
@@ -389,7 +456,6 @@
           ${_STANDARD_DEPS}
         PRIVATE
           ${_RULE_LINKOPTS}
-          ${PYTHON_LIBRARY}
       )
     endif()
   endforeach(_NAME)
diff --git a/build_tools/cmake/run_android_test.sh b/build_tools/cmake/run_android_test.sh
index 292890c..0c7510d 100755
--- a/build_tools/cmake/run_android_test.sh
+++ b/build_tools/cmake/run_android_test.sh
@@ -35,7 +35,7 @@
 set -x
 set -e
 
-adb push $TEST_EXECUTABLE $TEST_ANDROID_ABS_DIR/$(basename $TEST_EXECUTABLE)
+adb push $TEST_EXECUTABLE $TEST_ANDROID_ABS_DIR/$(basename $TEST_EXECUTABLE) 1>/dev/null
 
 if [ -n "$TEST_DATA" ]; then
   adb push $TEST_DATA $TEST_ANDROID_ABS_DIR/$(basename $TEST_DATA)
diff --git a/build_tools/docker/bazel/Dockerfile b/build_tools/docker/bazel/Dockerfile
index e9a587c..c42521a 100644
--- a/build_tools/docker/bazel/Dockerfile
+++ b/build_tools/docker/bazel/Dockerfile
@@ -27,40 +27,49 @@
 FROM ubuntu:18.04
 WORKDIR /usr/src/iree
 
-RUN apt-get update
-
 # Set environment variables.
 ENV CXX clang++
 ENV CC clang
 ENV PYTHON_BIN /usr/bin/python3
 ENV IREE_LLVMAOT_LINKER_PATH /usr/bin/ld
 
-# Install git for updating IREE's submodules.
-RUN apt-get install -y git
+RUN apt-get update \
+  && apt-get install -y \
+    # git for updating IREE's submodules.
+    git \
+    # utilities for later installations
+    unzip \
+    zip \
+    wget \
+    # core IREE dependencies.
+    clang \
+    libsdl2-dev
+
+# Disable apt-key parse warning. If someone knows how to do whatever the "proper"
+# thing is then feel free. The warning complains about parsing apt-key output,
+# which we're not even doing.
+ARG APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1
 
 # Install Bazel.
 # https://docs.bazel.build/versions/master/install-ubuntu.html
-ARG BAZEL_VERSION=2.1.0
-RUN apt-get install -y unzip zip wget \
-  && wget "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION?}/bazel-${BAZEL_VERSION?}-installer-linux-x86_64.sh" \
-  && chmod +x "bazel-${BAZEL_VERSION?}-installer-linux-x86_64.sh" \
-  && "./bazel-${BAZEL_VERSION?}-installer-linux-x86_64.sh" --user \
-  && rm "bazel-${BAZEL_VERSION?}-installer-linux-x86_64.sh"
-# Install a newer version of Bazel. We don't need the full installation now.
-# Just need to provide a different version for the version-identification
-# wrapper script to find in /root/.bazel/bin
+ARG BAZEL_VERSION=3.3.1
+# Change to a new version if upgrading Bazel.
 ARG NEW_BAZEL_VERSION=3.3.1
-RUN cd "/root/.bazel/bin" \
-  && wget "https://releases.bazel.build/${NEW_BAZEL_VERSION?}/release/bazel-${NEW_BAZEL_VERSION?}-linux-x86_64" \
-  && chmod +x "bazel-${NEW_BAZEL_VERSION?}-linux-x86_64"
-# ENV does not allow ${variable?} syntax.
-ENV PATH "/root/bin:${PATH}"
+RUN wget -qO - https://bazel.build/bazel-release.pub.gpg | apt-key add - \
+  && echo "deb [arch=amd64] https://storage.googleapis.com/bazel-apt stable jdk1.8" \
+    | tee /etc/apt/sources.list.d/bazel.list \
+  && apt-get update \
+  # Install Bazel pinned at the version we want. Optionally install an
+  # additional version of Bazel to ease upgrades (modify NEW_BAZEL_VERSION
+  # above). Bazel does some shenanigans to select the correct version based on
+  # your .bazelversion file. When upgrading, we therefore need to have both the
+  # old and new version. When the versions are the same this second installation
+  # is effectively a noop.
+  && apt-get install "bazel=${BAZEL_VERSION?}" "bazel-${NEW_BAZEL_VERSION?}"
 
-# Install core IREE dependencies.
-RUN apt-get install -y clang libsdl2-dev
-
-# Install python2 numpy. Temporary fix for issue #1737:
-# https://github.com/google/iree/issues/1737
-RUN apt-get install -y python-pip \
-  && python -m pip install --upgrade pip \
-  && python -m pip install numpy
+# TF requires python2 numpy at configure time...
+# TODO(#1737): Remove this
+RUN apt-get update \
+    && apt-get install -y python-pip \
+    && python -m pip install --upgrade pip \
+    && python -m pip install numpy
diff --git a/build_tools/docker/bazel_bindings/Dockerfile b/build_tools/docker/bazel_bindings/Dockerfile
index f7ef30f..8f07958 100644
--- a/build_tools/docker/bazel_bindings/Dockerfile
+++ b/build_tools/docker/bazel_bindings/Dockerfile
@@ -27,6 +27,11 @@
 FROM gcr.io/iree-oss/bazel
 
 # Install python3 and numpy.
-RUN apt-get install -y python3 python3-dev python3-pip python3-setuptools \
+RUN apt-get update \
+  && apt-get install -y \
+    python3 \
+    python3-dev \
+    python3-pip \
+    python3-setuptools \
   && python3 -m pip install --upgrade pip \
   && python3 -m pip install numpy
diff --git a/build_tools/docker/bazel_nvidia/Dockerfile b/build_tools/docker/bazel_nvidia/Dockerfile
new file mode 100644
index 0000000..86305c4
--- /dev/null
+++ b/build_tools/docker/bazel_nvidia/Dockerfile
@@ -0,0 +1,46 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# An image for building IREE's tensorflow integrations using bazel and vulkan.
+
+# Build using:
+# docker build --tag gcr.io/iree-oss/bazel-nvidia \
+#   build_tools/docker/bazel_nvidia/
+
+# Run interactively using the following, where IREE_WORKDIR is the path to your
+# local dev environment:
+# docker run -it --rm --entrypoint bash --volume ${IREE_WORKDIR}:/usr/src/iree/ \
+#   gcr.io/iree-oss/bazel-nvidia
+
+# Set up the image and working directory.
+# We start from bazel-tensorflow so this image can be used for testing TensorFlow
+# integrations.
+FROM gcr.io/iree-oss/bazel-tensorflow
+
+# Additionally, we need to install the Vulkan SDK and the NVIDIA Vulkan driver.
+
+ARG VULKAN_SDK_VERSION=1.2.141
+
+# Disable apt-key parse warning. If someone knows how to do whatever the "proper"
+# thing is then feel free. The warning complains about parsing apt-key output,
+# which we're not even doing.
+ARG APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1
+
+RUN wget -qO - http://packages.lunarg.com/lunarg-signing-key-pub.asc \
+    | apt-key add - \
+  && wget -qO \
+    "/etc/apt/sources.list.d/lunarg-vulkan-${VULKAN_SDK_VERSION?}-bionic.list" \
+    "http://packages.lunarg.com/vulkan/${VULKAN_SDK_VERSION?}/lunarg-vulkan-${VULKAN_SDK_VERSION?}-bionic.list" \
+  && apt-get update \
+  && DEBIAN_FRONTEND=noninteractive apt-get install -y vulkan-sdk nvidia-driver-440
diff --git a/build_tools/docker/bazel_swiftshader/Dockerfile b/build_tools/docker/bazel_swiftshader/Dockerfile
new file mode 100644
index 0000000..2af027d
--- /dev/null
+++ b/build_tools/docker/bazel_swiftshader/Dockerfile
@@ -0,0 +1,92 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# An image for building IREE with tensorflow integrations using bazel and
+# running Vulkan tests on SwiftShader.
+
+# Build using:
+# docker build --tag gcr.io/iree-oss/bazel-swiftshader \
+#   build_tools/docker/bazel_swiftshader/
+
+# Run interactively using the following, where IREE_WORKDIR is the path to your
+# local dev environment:
+# docker run -it --rm --entrypoint bash --volume ${IREE_WORKDIR}:/usr/src/iree/ \
+#   gcr.io/iree-oss/bazel-swiftshader
+
+# Set up the image and working directory.
+FROM gcr.io/iree-oss/bazel-tensorflow
+
+# TODO(#2651): The following steps are copied from cmake, cmake-vulkan, and
+# cmake-swiftshader. We might want to consider using docker multi-stage
+# builds to factor them out.
+
+# Additionally, we need to install the Vulkan SDK.
+
+ARG VULKAN_SDK_VERSION=1.2.141
+
+# Disable apt-key parse warning. If someone knows how to do whatever the "proper"
+# thing is then feel free. The warning complains about parsing apt-key output,
+# which we're not even doing.
+ARG APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1
+
+RUN wget -qO - http://packages.lunarg.com/lunarg-signing-key-pub.asc \
+    | apt-key add - \
+  && wget -qO \
+    "/etc/apt/sources.list.d/lunarg-vulkan-${VULKAN_SDK_VERSION?}-bionic.list" \
+    "http://packages.lunarg.com/vulkan/${VULKAN_SDK_VERSION?}/lunarg-vulkan-${VULKAN_SDK_VERSION?}-bionic.list" \
+  && apt-get update \
+  && apt-get install -y vulkan-sdk
+
+ARG SWIFTSHADER_COMMIT=6a8a74986c357b0c6fa0dfd2b4b9230af8d39d1a
+
+# Then compile and install SwiftShader.
+
+# cmake, ninja, and zlib are needed for compiling SwiftShader.
+RUN apt-get update && apt-get install -y cmake ninja-build zlib1g-dev
+
+# Update cmake to v3.13+, which is ahead of apt-get's version (3.10.2).
+ENV CMAKE_VERSION 3.13.5
+RUN apt-get update \
+  && mkdir ./cmake_install \
+  && cd cmake_install \
+  && wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION?}/cmake-${CMAKE_VERSION?}.tar.gz" \
+  && tar -xzvf "cmake-${CMAKE_VERSION?}.tar.gz" \
+  && cd "cmake-${CMAKE_VERSION?}/" \
+  && cmake . \
+  && make \
+  && make install
+
+RUN git clone https://github.com/google/swiftshader \
+  && cd swiftshader && git checkout "${SWIFTSHADER_COMMIT?}" && cd .. \
+  # Only build SwiftShader Vulkan ICD.
+  && cmake -S swiftshader/ -B build-swiftshader/ \
+           -GNinja \
+           -DSWIFTSHADER_BUILD_VULKAN=ON \
+           -DSWIFTSHADER_BUILD_EGL=OFF \
+           -DSWIFTSHADER_BUILD_GLESv2=OFF \
+           -DSWIFTSHADER_BUILD_GLES_CM=OFF \
+           -DSWIFTSHADER_BUILD_PVR=OFF \
+           -DSWIFTSHADER_BUILD_TESTS=OFF \
+  && cmake --build build-swiftshader/ \
+           --config Release \
+           --target vk_swiftshader \
+  # Copy the ICD JSON and .so to a known place.
+  && cp -rf build-swiftshader/Linux /swiftshader \
+  # Keep track of the commit we are using.
+  && echo "${SWIFTSHADER_COMMIT?}" > /swiftshader/git-commit \
+  # Clean up everything.
+  && rm -rf swiftshader build-swiftshader
+
+# Set VK_ICD_FILENAMES so Vulkan loader can find the SwiftShader ICD.
+ENV VK_ICD_FILENAMES /swiftshader/vk_swiftshader_icd.json
diff --git a/build_tools/docker/bazel_tensorflow/Dockerfile b/build_tools/docker/bazel_tensorflow/Dockerfile
index a117990..0c37354 100644
--- a/build_tools/docker/bazel_tensorflow/Dockerfile
+++ b/build_tools/docker/bazel_tensorflow/Dockerfile
@@ -24,9 +24,7 @@
 #   gcr.io/iree-oss/bazel-tensorflow
 
 # Set up the image and working directory.
-FROM gcr.io/iree-oss/bazel
+FROM gcr.io/iree-oss/bazel-bindings
 
-# Install python3, tensorflow and numpy.
-RUN apt-get install -y python3 python3-dev python3-pip python3-setuptools \
-  && python3 -m pip install --upgrade pip \
-  && python3 -m pip install numpy tf-nightly
+# Install tensorflow.
+RUN python3 -m pip install tf-nightly
diff --git a/build_tools/docker/build_and_update_gcr.py b/build_tools/docker/build_and_update_gcr.py
index 90c7454..6de59a4 100755
--- a/build_tools/docker/build_and_update_gcr.py
+++ b/build_tools/docker/build_and_update_gcr.py
@@ -20,30 +20,37 @@
 """
 
 import argparse
+import functools
 import os
 import subprocess
+import sys
 
 IREE_GCR_URL = 'gcr.io/iree-oss/'
 DOCKER_DIR = 'build_tools/docker/'
 
-IMAGES = [
-    'bazel',
-    'bazel-bindings',
-    'bazel-tensorflow',
-    'cmake',
-    'cmake-android',
-    'cmake-nvidia',
-    'rbe-toolchain',
-]
-IMAGES_HELP = [f'`{name}`' for name in IMAGES]
-IMAGES_HELP = f'{", ".join(IMAGES_HELP[:-1])} or {IMAGES_HELP[-1]}'
-
-# Map from image names to images that depend on them.
-IMAGES_TO_DEPENDENT_IMAGES = {
-    'bazel': ['bazel-bindings', 'bazel-tensorflow'],
-    'cmake': ['cmake-android', 'cmake-nvidia'],
+# Map from image names to images that they depend on.
+IMAGES_TO_DEPENDENCIES = {
+    'bazel': [],
+    'bazel-bindings': ['bazel'],
+    'bazel-tensorflow': ['bazel-bindings'],
+    'bazel-nvidia': ['bazel-tensorflow'],
+    'bazel-swiftshader': ['bazel-tensorflow'],
+    'cmake': [],
+    'cmake-android': ['cmake'],
+    'cmake-nvidia': ['cmake'],
+    'cmake-vulkan': ['cmake'],
+    'cmake-swiftshader': ['cmake-vulkan'],
+    'rbe-toolchain': [],
 }
 
+IMAGES_TO_DEPENDENT_IMAGES = {k: [] for k in IMAGES_TO_DEPENDENCIES.keys()}
+for image, dependencies in IMAGES_TO_DEPENDENCIES.items():
+  for dependency in dependencies:
+    IMAGES_TO_DEPENDENT_IMAGES[dependency].append(image)
+
+IMAGES_HELP = [f'`{name}`' for name in IMAGES_TO_DEPENDENCIES.keys()]
+IMAGES_HELP = f'{", ".join(IMAGES_HELP)} or `all`'
+
 RBE_MESSAGE = """
 Remember to update the `rbe_default` digest in the `WORKSPACE` file to reflect
 the new digest for the container.
@@ -57,8 +64,10 @@
       description="Build IREE's Docker images and optionally push them to GCR.")
   parser.add_argument(
       '--image',
+      dest='images',
       type=str,
       required=True,
+      action='append',
       help=f'Name of the image to build: {IMAGES_HELP}.')
   parser.add_argument(
       '--tag',
@@ -73,34 +82,77 @@
       help='Push the built images to GCR. Requires gcloud authorization.')
 
   args = parser.parse_args()
-  if args.image not in IMAGES:
-    raise parser.error('Expected --image to be one of:\n'
-                       f'  {IMAGES_HELP}\n'
-                       f'but got `{args.image}`.')
-
+  for image in args.images:
+    if image == 'all':
+      args.images = IMAGES_TO_DEPENDENCIES.keys()
+    elif image not in IMAGES_TO_DEPENDENCIES.keys():
+      raise parser.error('Expected --image to be one of:\n'
+                         f'  {IMAGES_HELP}\n'
+                         f'but got `{image}`.')
   return args
 
 
+def cmp_images_by_dependency(image1, image2):
+  if image2 in IMAGES_TO_DEPENDENT_IMAGES[image1]:
+    return -1
+  if image1 in IMAGES_TO_DEPENDENT_IMAGES[image2]:
+    return 1
+  return (image1 > image2) - (image1 < image2)
+
+
+def run_command(command):
+  print(f'Running: {" ".join(command)}')
+  process = subprocess.Popen(
+      command,
+      bufsize=1,
+      stderr=subprocess.STDOUT,
+      stdout=subprocess.PIPE,
+      text=True)
+  for line in process.stdout:
+    print(line, end='')
+
+  return process.poll()
+
+
+def check_command(command):
+  exit_code = run_command(command)
+  if exit_code != 0:
+    print(f'Command failed: {" ".join(command)}')
+    sys.exit(exit_code)
+
+
 if __name__ == '__main__':
   args = parse_arguments()
 
   # Ensure the user has the correct authorization if they try to push to GCR.
   if args.push:
-    subprocess.check_output(['gcloud', 'auth', 'configure-docker'])
+    if run_command(['which', 'gcloud']) != 0:
+      print('gcloud not found.'
+            ' See https://cloud.google.com/sdk/install for installation.')
+      sys.exit(1)
+    check_command(['gcloud', 'auth', 'configure-docker'])
 
-  # Check if any images depend on `args.image` and update them if they do.
-  images_to_update = [args.image]
-  if args.image in IMAGES_TO_DEPENDENT_IMAGES:
-    images_to_update.extend(IMAGES_TO_DEPENDENT_IMAGES[args.image])
+  # Check if any images depend on `args.images` and update them if they do.
+  images_to_update_set = set()
+  to_check = list(args.images)
+  while to_check:
+    image = to_check.pop()
+    if image not in images_to_update_set:
+      images_to_update_set.add(image)
+      to_check.extend(IMAGES_TO_DEPENDENT_IMAGES[image])
 
+  # Topo sort by image dependency
+  images_to_update = sorted(
+      images_to_update_set, key=functools.cmp_to_key(cmp_images_by_dependency))
+
+  print(f'Also updating dependent images. Will update: {images_to_update}')
   for image in images_to_update:
     print(f'Updating image {image}')
     image_url = os.path.join(IREE_GCR_URL, f'{image}:{args.tag}')
     image_path = os.path.join(DOCKER_DIR, image.replace('-', '_'))
-    subprocess.check_output(
-        ['docker', 'build', '--tag', image_url, image_path])
+    check_command(['docker', 'build', '--tag', image_url, image_path])
     if args.push:
-      subprocess.check_output(['docker', 'push', image_url])
+      check_command(['docker', 'push', image_url])
 
   if 'rbe-toolchain' in images_to_update:
     print(RBE_MESSAGE)
diff --git a/build_tools/docker/cmake/Dockerfile b/build_tools/docker/cmake/Dockerfile
index 92cece8..bde1f4d 100644
--- a/build_tools/docker/cmake/Dockerfile
+++ b/build_tools/docker/cmake/Dockerfile
@@ -27,14 +27,25 @@
 FROM ubuntu:18.04
 WORKDIR /usr/src/iree/
 
-RUN apt-get update
-# TODO: Remove this if the `apt-get install` below works without it again.
-RUN apt update
+RUN apt-get update \
+  && apt-get install -y \
+    # git for updating IREE's submodules.
+    git \
+    # For later installations
+    wget \
+    # For building with ninja
+    ninja-build \
+    # For bootstrapping the cmake installation
+    cmake \
+    # core IREE dependencies.
+    clang \
+    libsdl2-dev \
+    libssl-dev
 
 # Update cmake to v3.13+, which is ahead of apt-get's version (3.10.2).
 # Install dependencies, including an old version of cmake to bootstrap.
 ENV CMAKE_VERSION 3.13.5
-RUN apt-get install -y clang cmake libssl-dev wget \
+RUN apt-get update \
   && mkdir ./cmake_install \
   && cd cmake_install \
   && wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION?}/cmake-${CMAKE_VERSION?}.tar.gz" \
@@ -44,14 +55,12 @@
   && make \
   && make install
 
-# Install dependencies.
-RUN apt-get install -y \
-  git \
-  ninja-build \
-  python3 \
-  python3-pip \
-  python3-setuptools \
-  # Install dependencies for the python bindings tests.
+# Dependencies for the python bindings tests.
+RUN apt-get update \
+  && apt-get install -y \
+    python3 \
+    python3-pip \
+    python3-setuptools \
   && python3 -m pip install --upgrade pip \
   && python3 -m pip install numpy absl-py
 
diff --git a/build_tools/docker/cmake_nvidia/Dockerfile b/build_tools/docker/cmake_nvidia/Dockerfile
index 8679656..1fdeaee 100644
--- a/build_tools/docker/cmake_nvidia/Dockerfile
+++ b/build_tools/docker/cmake_nvidia/Dockerfile
@@ -34,19 +34,22 @@
 #      does not support Ubuntu 18.04.
 # This allows to share configuration with base CMake, but it also means we need
 # to MATCH the driver version between the host machine and the docker image.
+# TODO: use cmake-vulkan as the base.
 FROM gcr.io/iree-oss/cmake
 
 # Additionally, we need to install the Vulkan SDK and the NVIDIA Vulkan driver.
 
 ARG VULKAN_SDK_VERSION=1.2.141
 
-# Disable apt-key parse waring.
-ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1
+# Disable apt-key parse warning. If someone knows how to do whatever the "proper"
+# thing is then feel free. The warning complains about parsing apt-key output,
+# which we're not even doing.
+ARG APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1
 
-# Disable prompt during keyboard configuration.
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN wget -qO - http://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - \
-  && wget -qO /etc/apt/sources.list.d/lunarg-vulkan-$VULKAN_SDK_VERSION-bionic.list http://packages.lunarg.com/vulkan/$VULKAN_SDK_VERSION/lunarg-vulkan-$VULKAN_SDK_VERSION-bionic.list \
+RUN wget -qO - http://packages.lunarg.com/lunarg-signing-key-pub.asc \
+    | apt-key add - \
+  && wget -qO \
+    "/etc/apt/sources.list.d/lunarg-vulkan-${VULKAN_SDK_VERSION?}-bionic.list" \
+    "http://packages.lunarg.com/vulkan/${VULKAN_SDK_VERSION?}/lunarg-vulkan-${VULKAN_SDK_VERSION?}-bionic.list" \
   && apt-get update \
   && apt-get install -y vulkan-sdk nvidia-driver-440
diff --git a/build_tools/docker/cmake_swiftshader/Dockerfile b/build_tools/docker/cmake_swiftshader/Dockerfile
new file mode 100644
index 0000000..b92ce02
--- /dev/null
+++ b/build_tools/docker/cmake_swiftshader/Dockerfile
@@ -0,0 +1,60 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# An image for building IREE using CMake and testing IREE with SwiftShader
+# Vulkan implementation.
+
+# Build using:
+# docker build --tag gcr.io/iree-oss/cmake-swiftshader \
+#   build_tools/docker/cmake_swiftshader/
+
+# Run interactively using the following, where IREE_WORKDIR is the path to your
+# local dev environment:
+# docker run -it --rm --entrypoint bash \
+#   --volume "${IREE_WORKDIR?}:/usr/src/iree/" \
+#   --gpus all \
+#   gcr.io/iree-oss/cmake-swiftshader
+
+# Set up the image and working directory by inheriting the base
+# CMake configuration.
+FROM gcr.io/iree-oss/cmake-vulkan
+
+ARG SWIFTSHADER_COMMIT=6a8a74986c357b0c6fa0dfd2b4b9230af8d39d1a
+
+# zlib is needed for compiling SwiftShader.
+RUN apt-get update && apt-get install -y zlib1g-dev
+
+RUN git clone https://github.com/google/swiftshader \
+  && cd swiftshader && git checkout "${SWIFTSHADER_COMMIT?}" && cd .. \
+  # Only build SwiftShader Vulkan ICD.
+  && cmake -S swiftshader/ -B build-swiftshader/ \
+           -GNinja \
+           -DSWIFTSHADER_BUILD_VULKAN=ON \
+           -DSWIFTSHADER_BUILD_EGL=OFF \
+           -DSWIFTSHADER_BUILD_GLESv2=OFF \
+           -DSWIFTSHADER_BUILD_GLES_CM=OFF \
+           -DSWIFTSHADER_BUILD_PVR=OFF \
+           -DSWIFTSHADER_BUILD_TESTS=OFF \
+  && cmake --build build-swiftshader/ \
+           --config Release \
+           --target vk_swiftshader \
+  # Copy the ICD JSON and .so to a known place.
+  && cp -rf build-swiftshader/Linux /swiftshader \
+  # Keep track of the commit we are using.
+  && echo "${SWIFTSHADER_COMMIT?}" > /swiftshader/git-commit \
+  # Clean up everything.
+  && rm -rf swiftshader build-swiftshader
+
+# Set VK_ICD_FILENAMES so Vulkan loader can find the SwiftShader ICD.
+ENV VK_ICD_FILENAMES /swiftshader/vk_swiftshader_icd.json
diff --git a/build_tools/docker/cmake_vulkan/Dockerfile b/build_tools/docker/cmake_vulkan/Dockerfile
new file mode 100644
index 0000000..f2cdee6
--- /dev/null
+++ b/build_tools/docker/cmake_vulkan/Dockerfile
@@ -0,0 +1,47 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# An image with Vulkan SDK for building IREE using CMake.
+
+# Build using:
+# docker build --tag gcr.io/iree-oss/cmake-vulkan \
+#   build_tools/docker/cmake_vulkan/
+
+# Run interactively using the following, where IREE_WORKDIR is the path to your
+# local dev environment:
+# docker run -it --rm --entrypoint bash \
+#   --volume "${IREE_WORKDIR?}:/usr/src/iree/" \
+#   --gpus all \
+#   gcr.io/iree-oss/cmake-vulkan
+
+# Set up the image and working directory by inheriting the base
+# CMake configuration.
+FROM gcr.io/iree-oss/cmake
+
+# Additionally, we need to install the Vulkan SDK.
+
+ARG VULKAN_SDK_VERSION=1.2.141
+
+# Disable apt-key parse warning. If someone knows how to do whatever the "proper"
+# thing is then feel free. The warning complains about parsing apt-key output,
+# which we're not even doing.
+ARG APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1
+
+RUN wget -qO - http://packages.lunarg.com/lunarg-signing-key-pub.asc \
+    | apt-key add - \
+  && wget -qO \
+    "/etc/apt/sources.list.d/lunarg-vulkan-${VULKAN_SDK_VERSION?}-bionic.list" \
+    "http://packages.lunarg.com/vulkan/${VULKAN_SDK_VERSION?}/lunarg-vulkan-${VULKAN_SDK_VERSION?}-bionic.list" \
+  && apt-get update \
+  && apt-get install -y vulkan-sdk
diff --git a/build_tools/docker/rbe_toolchain/Dockerfile b/build_tools/docker/rbe_toolchain/Dockerfile
index c2df299..878aca3 100755
--- a/build_tools/docker/rbe_toolchain/Dockerfile
+++ b/build_tools/docker/rbe_toolchain/Dockerfile
@@ -21,23 +21,29 @@
 
 FROM gcr.io/cloud-marketplace/google/rbe-ubuntu16-04@sha256:1a8ed713f40267bb51fe17de012fa631a20c52df818ccb317aaed2ee068dfc61
 
-RUN apt-get update
-RUN apt-get install -y python3 python3-pip
-RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install numpy
+RUN apt-get update \
+    && apt-get install -y \
+        python3 \
+        python3-pip \
+    && python3 -m pip install --upgrade pip \
+    && python3 -m pip install numpy
 
-# Install dependencies for python3.6-dev
-RUN apt-get install -y software-properties-common
+# Dependency for python3.6-dev. Needs to be installed separately from the above
+# for... some reason
+RUN apt-get update && apt-get install -y software-properties-common
+
 # apt-add-repository requires a version of python with the softwareproperties
 # module. To use this command, we:
 #   1. remove the symlink to python3 from python3.6 and symlink it to python3.5
 #   2. run apt-add-repository with python3 = python3.5
 #   3. resymlink python3 to /opt/python3.6/bin/python3.6
 # See https://github.com/google/iree/issues/1966 for more information.
-RUN rm /usr/bin/python3 && ln -s /usr/bin/python3.5 /usr/bin/python3
-RUN add-apt-repository ppa:deadsnakes/ppa
-RUN rm /usr/bin/python3 && ln -s /opt/python3.6/bin/python3.6 /usr/bin/python3
+RUN rm /usr/bin/python3 \
+    && ln -s /usr/bin/python3.5 /usr/bin/python3 \
+    && add-apt-repository ppa:deadsnakes/ppa \
+    && rm /usr/bin/python3 \
+    && ln -s /opt/python3.6/bin/python3.6 /usr/bin/python3
 
 # Install python3.6-dev
-RUN apt-get update
-RUN apt-get install -y python3.6 python3.6-dev
+RUN apt-get update \
+  && apt-get install -y python3.6 python3.6-dev
diff --git a/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/integrations/build.sh b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/integrations/build.sh
new file mode 100755
index 0000000..32c061f
--- /dev/null
+++ b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/integrations/build.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# For use within a IREE bazel-swiftshader docker image on a Kokoro VM.
+# Log some information about the environment, initialize the submodules and then
+# run the bazel integrations tests.
+
+set -e
+set -x
+
+# Print the UTC time when set -x is on
+export PS4='[$(date -u "+%T %Z")] '
+
+# Check these exist and print the versions for later debugging
+bazel --version
+"$CXX" --version
+"$CC" --version
+"$PYTHON_BIN" -V
+# TODO(#1875): Make PYTHON_BIN also control the runtime version
+python3 -V
+
+# Print Vulkan related information: SDK version and GPU ICD version
+vulkaninfo 2>/dev/null | grep "Vulkan Instance" || echo "Vulkan Instance not found!"
+vulkaninfo 2>/dev/null | grep -A7 "VkPhysicalDeviceProperties"  || echo "VkPhysicalDeviceProperties not found!"
+
+# Print SwiftShader git commit
+cat /swiftshader/git-commit
+
+echo "Initializing submodules"
+./scripts/git/submodule_versions.py init
+
+echo "Building and testing with bazel"
+./build_tools/bazel/build_tensorflow.sh
diff --git a/kokoro/gcp_ubuntu/bazel/bindings/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/integrations/build_kokoro.sh
old mode 100644
new mode 100755
similarity index 62%
copy from kokoro/gcp_ubuntu/bazel/bindings/build_kokoro.sh
copy to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/integrations/build_kokoro.sh
index e2a9bd3..63c5893
--- a/kokoro/gcp_ubuntu/bazel/bindings/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/integrations/build_kokoro.sh
@@ -14,30 +14,31 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Build and test IREE's bindings within the gcr.io/iree-oss/bazel-bindings
+# Build and test IREE's integrations within gcr.io/iree-oss/bazel-swiftshader
 # image using Kokoro.
+# Requires the environment variables KOKORO_ROOT and KOKORO_ARTIFACTS_DIR, which
+# are set by Kokoro.
 
-set -e
 set -x
+set -e
+set -o pipefail
 
 # Print the UTC time when set -x is on
 export PS4='[$(date -u "+%T %Z")] '
 
-# Kokoro checks out the repository here.
-WORKDIR="${KOKORO_ARTIFACTS_DIR?}/github/iree"
+source "${KOKORO_ARTIFACTS_DIR?}/github/iree/build_tools/kokoro/gcp_ubuntu/docker_common.sh"
 
-# Mount the checked out repository, make that the working directory and run the
-# tests in the bazel-bindings image.
-docker run \
-  --volume "${WORKDIR?}:${WORKDIR?}" \
-  --workdir="${WORKDIR?}" \
-  --rm \
-  gcr.io/iree-oss/bazel-bindings:prod \
-  kokoro/gcp_ubuntu/bazel/bindings/build.sh
+# Sets DOCKER_RUN_ARGS
+docker_setup
+
+docker run "${DOCKER_RUN_ARGS[@]?}" \
+  --env IREE_VULKAN_DISABLE=0 \
+  gcr.io/iree-oss/bazel-swiftshader:prod \
+  build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/integrations/build.sh
 
 # Kokoro will rsync this entire directory back to the executor orchestrating the
 # build which takes forever and is totally useless.
-sudo rm -rf "${KOKORO_ARTIFACTS_DIR?}"/*
+rm -rf "${KOKORO_ARTIFACTS_DIR?}"/*
 
 # Print out artifacts dir contents after deleting them as a coherence check.
 ls -1a "${KOKORO_ARTIFACTS_DIR?}/"
diff --git a/kokoro/gcp_ubuntu/cmake/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/integrations/common.cfg
similarity index 72%
copy from kokoro/gcp_ubuntu/cmake/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/integrations/common.cfg
index e4cc270..609936e 100644
--- a/kokoro/gcp_ubuntu/cmake/continuous.cfg
+++ b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/integrations/common.cfg
@@ -1,6 +1,6 @@
 # Format: //devtools/kokoro/config/proto/build.proto
 
-# Copyright 2020 Google LLC
+# Copyright 2019 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,6 +14,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Deliberately blank as everything necessary is configured in common files, but
-# file must still exist to match corresponding (Google internal) job
-# configurations that trigger the builds.
+build_file: "iree/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/integrations/build_kokoro.sh"
diff --git a/kokoro/gcp_ubuntu/bazel/integrations/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/integrations/google.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/bazel/integrations/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/integrations/google.cfg
diff --git a/kokoro/gcp_ubuntu/bazel/integrations/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/integrations/main.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/bazel/integrations/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/integrations/main.cfg
diff --git a/kokoro/gcp_ubuntu/bazel/integrations/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/integrations/presubmit.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/bazel/integrations/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/integrations/presubmit.cfg
diff --git a/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-turing/integrations/build.sh b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-turing/integrations/build.sh
new file mode 100755
index 0000000..51b491d
--- /dev/null
+++ b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-turing/integrations/build.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# For use within a IREE bazel-nvidia docker image on a Kokoro VM.
+# Log some information about the environment, initialize the submodules and then
+# run the bazel integrations tests.
+
+set -e
+set -x
+
+# Print the UTC time when set -x is on
+export PS4='[$(date -u "+%T %Z")] '
+
+# Check these exist and print the versions for later debugging
+bazel --version
+"$CXX" --version
+"$CC" --version
+"$PYTHON_BIN" -V
+# TODO(#1875): Make PYTHON_BIN also control the runtime version
+python3 -V
+
+# Print Vulkan related information: SDK version and GPU ICD version
+vulkaninfo 2>/dev/null | grep "Vulkan Instance" || echo "Vulkan Instance not found!"
+vulkaninfo 2>/dev/null | grep -A7 "VkPhysicalDeviceProperties"  || echo "VkPhysicalDeviceProperties not found!"
+
+echo "Initializing submodules"
+./scripts/git/submodule_versions.py init
+
+echo "Building and testing with bazel"
+./build_tools/bazel/build_tensorflow.sh
diff --git a/kokoro/gcp_ubuntu/bazel/integrations/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-turing/integrations/build_kokoro.sh
old mode 100644
new mode 100755
similarity index 88%
rename from kokoro/gcp_ubuntu/bazel/integrations/build_kokoro.sh
rename to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-turing/integrations/build_kokoro.sh
index 1716266..0712a66
--- a/kokoro/gcp_ubuntu/bazel/integrations/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-turing/integrations/build_kokoro.sh
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Build and test IREE's integrations within the gcr.io/iree-oss/bazel-tensorflow
+# Build and test IREE's integrations within the gcr.io/iree-oss/bazel-nvidia
 # image using Kokoro.
 
 set -e
@@ -32,8 +32,10 @@
   --volume "${WORKDIR?}:${WORKDIR?}" \
   --workdir="${WORKDIR?}" \
   --rm \
-  gcr.io/iree-oss/bazel-tensorflow:prod \
-  kokoro/gcp_ubuntu/bazel/integrations/build.sh
+  --env IREE_VULKAN_DISABLE=0 \
+  --gpus all \
+  gcr.io/iree-oss/bazel-nvidia:prod \
+  build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-turing/integrations/build.sh
 
 # Kokoro will rsync this entire directory back to the executor orchestrating the
 # build which takes forever and is totally useless.
diff --git a/kokoro/gcp_ubuntu/cmake/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-turing/integrations/common.cfg
similarity index 72%
copy from kokoro/gcp_ubuntu/cmake/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-turing/integrations/common.cfg
index e4cc270..eb09ca3 100644
--- a/kokoro/gcp_ubuntu/cmake/continuous.cfg
+++ b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-turing/integrations/common.cfg
@@ -1,6 +1,6 @@
 # Format: //devtools/kokoro/config/proto/build.proto
 
-# Copyright 2020 Google LLC
+# Copyright 2019 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,6 +14,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Deliberately blank as everything necessary is configured in common files, but
-# file must still exist to match corresponding (Google internal) job
-# configurations that trigger the builds.
+build_file: "iree/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-turing/integrations/build_kokoro.sh"
diff --git a/kokoro/gcp_ubuntu/bazel/integrations/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-turing/integrations/google.cfg
similarity index 100%
rename from kokoro/gcp_ubuntu/bazel/integrations/continuous.cfg
rename to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-turing/integrations/google.cfg
diff --git a/kokoro/gcp_ubuntu/bazel/integrations/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-turing/integrations/main.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/bazel/integrations/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-turing/integrations/main.cfg
diff --git a/kokoro/gcp_ubuntu/bazel/integrations/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-turing/integrations/presubmit.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/bazel/integrations/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-turing/integrations/presubmit.cfg
diff --git a/kokoro/gcp_ubuntu/bazel/bindings/build.sh b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/bindings/build.sh
similarity index 100%
rename from kokoro/gcp_ubuntu/bazel/bindings/build.sh
rename to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/bindings/build.sh
diff --git a/kokoro/gcp_ubuntu/bazel/bindings/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/bindings/build_kokoro.sh
old mode 100644
new mode 100755
similarity index 73%
rename from kokoro/gcp_ubuntu/bazel/bindings/build_kokoro.sh
rename to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/bindings/build_kokoro.sh
index e2a9bd3..24cbabf
--- a/kokoro/gcp_ubuntu/bazel/bindings/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/bindings/build_kokoro.sh
@@ -16,28 +16,28 @@
 
 # Build and test IREE's bindings within the gcr.io/iree-oss/bazel-bindings
 # image using Kokoro.
+# Requires the environment variables KOKORO_ROOT and KOKORO_ARTIFACTS_DIR, which
+# are set by Kokoro.
 
-set -e
 set -x
+set -e
+set -o pipefail
 
 # Print the UTC time when set -x is on
 export PS4='[$(date -u "+%T %Z")] '
 
-# Kokoro checks out the repository here.
-WORKDIR="${KOKORO_ARTIFACTS_DIR?}/github/iree"
+source "${KOKORO_ARTIFACTS_DIR?}/github/iree/build_tools/kokoro/gcp_ubuntu/docker_common.sh"
 
-# Mount the checked out repository, make that the working directory and run the
-# tests in the bazel-bindings image.
-docker run \
-  --volume "${WORKDIR?}:${WORKDIR?}" \
-  --workdir="${WORKDIR?}" \
-  --rm \
+# Sets DOCKER_RUN_ARGS
+docker_setup
+
+docker run "${DOCKER_RUN_ARGS[@]?}" \
   gcr.io/iree-oss/bazel-bindings:prod \
-  kokoro/gcp_ubuntu/bazel/bindings/build.sh
+  build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/bindings/build.sh
 
 # Kokoro will rsync this entire directory back to the executor orchestrating the
 # build which takes forever and is totally useless.
-sudo rm -rf "${KOKORO_ARTIFACTS_DIR?}"/*
+rm -rf "${KOKORO_ARTIFACTS_DIR?}"/*
 
 # Print out artifacts dir contents after deleting them as a coherence check.
 ls -1a "${KOKORO_ARTIFACTS_DIR?}/"
diff --git a/kokoro/gcp_ubuntu/cmake/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/bindings/common.cfg
similarity index 72%
copy from kokoro/gcp_ubuntu/cmake/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/bindings/common.cfg
index e4cc270..8a49430 100644
--- a/kokoro/gcp_ubuntu/cmake/continuous.cfg
+++ b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/bindings/common.cfg
@@ -1,6 +1,6 @@
 # Format: //devtools/kokoro/config/proto/build.proto
 
-# Copyright 2020 Google LLC
+# Copyright 2019 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Deliberately blank as everything necessary is configured in common files, but
-# file must still exist to match corresponding (Google internal) job
-# configurations that trigger the builds.
+# Common configuration for Kokoro builds that run the bindings build with bazel
+# on linux.
+
+build_file: "iree/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/bindings/build_kokoro.sh"
diff --git a/kokoro/gcp_ubuntu/bazel/bindings/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/bindings/google.cfg
similarity index 100%
rename from kokoro/gcp_ubuntu/bazel/bindings/continuous.cfg
rename to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/bindings/google.cfg
diff --git a/kokoro/gcp_ubuntu/bazel/bindings/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/bindings/main.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/bazel/bindings/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/bindings/main.cfg
diff --git a/kokoro/gcp_ubuntu/bazel/bindings/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/bindings/presubmit.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/bazel/bindings/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/bindings/presubmit.cfg
diff --git a/kokoro/gcp_ubuntu/bazel/core/build.sh b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/core/build.sh
similarity index 100%
rename from kokoro/gcp_ubuntu/bazel/core/build.sh
rename to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/core/build.sh
diff --git a/kokoro/gcp_ubuntu/bazel/core/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/core/build_kokoro.sh
old mode 100644
new mode 100755
similarity index 72%
rename from kokoro/gcp_ubuntu/bazel/core/build_kokoro.sh
rename to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/core/build_kokoro.sh
index 031eab8..da07cdd
--- a/kokoro/gcp_ubuntu/bazel/core/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/core/build_kokoro.sh
@@ -16,28 +16,28 @@
 
 # Build and test IREE's core within the gcr.io/iree-oss/bazel image using
 # Kokoro.
+# Requires the environment variables KOKORO_ROOT and KOKORO_ARTIFACTS_DIR, which
+# are set by Kokoro.
 
-set -e
 set -x
+set -e
+set -o pipefail
 
 # Print the UTC time when set -x is on
 export PS4='[$(date -u "+%T %Z")] '
 
-# Kokoro checks out the repository here.
-WORKDIR="${KOKORO_ARTIFACTS_DIR?}/github/iree"
+source "${KOKORO_ARTIFACTS_DIR?}/github/iree/build_tools/kokoro/gcp_ubuntu/docker_common.sh"
 
-# Mount the checked out repository, make that the working directory and run the
-# tests in the bazel image.
-docker run \
-  --volume "${WORKDIR?}:${WORKDIR?}" \
-  --workdir="${WORKDIR?}" \
-  --rm \
+# Sets DOCKER_RUN_ARGS
+docker_setup
+
+docker run "${DOCKER_RUN_ARGS[@]?}" \
   gcr.io/iree-oss/bazel:prod \
-  kokoro/gcp_ubuntu/bazel/core/build.sh
+  build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/core/build.sh
 
 # Kokoro will rsync this entire directory back to the executor orchestrating the
 # build which takes forever and is totally useless.
-sudo rm -rf "${KOKORO_ARTIFACTS_DIR?}"/*
+rm -rf "${KOKORO_ARTIFACTS_DIR?}"/*
 
 # Print out artifacts dir contents after deleting them as a coherence check.
 ls -1a "${KOKORO_ARTIFACTS_DIR?}/"
diff --git a/kokoro/gcp_ubuntu/cmake/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/core/common.cfg
old mode 100644
new mode 100755
similarity index 72%
copy from kokoro/gcp_ubuntu/cmake/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/core/common.cfg
index e4cc270..3a22d10
--- a/kokoro/gcp_ubuntu/cmake/continuous.cfg
+++ b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/core/common.cfg
@@ -1,6 +1,6 @@
 # Format: //devtools/kokoro/config/proto/build.proto
 
-# Copyright 2020 Google LLC
+# Copyright 2019 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Deliberately blank as everything necessary is configured in common files, but
-# file must still exist to match corresponding (Google internal) job
-# configurations that trigger the builds.
+# Common configuration for Kokoro builds that run the core build with bazel on
+# linux.
+
+build_file: "iree/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/core/build_kokoro.sh"
diff --git a/kokoro/gcp_ubuntu/bazel/core/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/core/google.cfg
similarity index 100%
rename from kokoro/gcp_ubuntu/bazel/core/continuous.cfg
rename to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/core/google.cfg
diff --git a/kokoro/gcp_ubuntu/bazel/core/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/core/main.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/bazel/core/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/core/main.cfg
diff --git a/kokoro/gcp_ubuntu/bazel/core/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/core/presubmit.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/bazel/core/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/core/presubmit.cfg
diff --git a/kokoro/gcp_ubuntu/bazel/integrations/build.sh b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/integrations/build.sh
similarity index 100%
rename from kokoro/gcp_ubuntu/bazel/integrations/build.sh
rename to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/integrations/build.sh
diff --git a/kokoro/gcp_ubuntu/bazel/integrations/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/integrations/build_kokoro.sh
old mode 100644
new mode 100755
similarity index 73%
copy from kokoro/gcp_ubuntu/bazel/integrations/build_kokoro.sh
copy to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/integrations/build_kokoro.sh
index 1716266..c35d897
--- a/kokoro/gcp_ubuntu/bazel/integrations/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/integrations/build_kokoro.sh
@@ -16,28 +16,28 @@
 
 # Build and test IREE's integrations within the gcr.io/iree-oss/bazel-tensorflow
 # image using Kokoro.
+# Requires the environment variables KOKORO_ROOT and KOKORO_ARTIFACTS_DIR, which
+# are set by Kokoro.
 
-set -e
 set -x
+set -e
+set -o pipefail
 
 # Print the UTC time when set -x is on
 export PS4='[$(date -u "+%T %Z")] '
 
-# Kokoro checks out the repository here.
-WORKDIR="${KOKORO_ARTIFACTS_DIR?}/github/iree"
+source "${KOKORO_ARTIFACTS_DIR?}/github/iree/build_tools/kokoro/gcp_ubuntu/docker_common.sh"
 
-# Mount the checked out repository, make that the working directory and run the
-# tests in the bazel-tensorflow image.
-docker run \
-  --volume "${WORKDIR?}:${WORKDIR?}" \
-  --workdir="${WORKDIR?}" \
-  --rm \
+# Sets DOCKER_RUN_ARGS
+docker_setup
+
+docker run "${DOCKER_RUN_ARGS[@]?}" \
   gcr.io/iree-oss/bazel-tensorflow:prod \
-  kokoro/gcp_ubuntu/bazel/integrations/build.sh
+  build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/integrations/build.sh
 
 # Kokoro will rsync this entire directory back to the executor orchestrating the
 # build which takes forever and is totally useless.
-sudo rm -rf "${KOKORO_ARTIFACTS_DIR?}"/*
+rm -rf "${KOKORO_ARTIFACTS_DIR?}"/*
 
 # Print out artifacts dir contents after deleting them as a coherence check.
 ls -1a "${KOKORO_ARTIFACTS_DIR?}/"
diff --git a/kokoro/gcp_ubuntu/cmake/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/integrations/common.cfg
similarity index 72%
copy from kokoro/gcp_ubuntu/cmake/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/integrations/common.cfg
index e4cc270..eb31e55 100644
--- a/kokoro/gcp_ubuntu/cmake/continuous.cfg
+++ b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/integrations/common.cfg
@@ -1,6 +1,6 @@
 # Format: //devtools/kokoro/config/proto/build.proto
 
-# Copyright 2020 Google LLC
+# Copyright 2019 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Deliberately blank as everything necessary is configured in common files, but
-# file must still exist to match corresponding (Google internal) job
-# configurations that trigger the builds.
+# Common configuration for Kokoro builds that run the integrations build with
+# bazel on linux.
+
+build_file: "iree/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/integrations/build_kokoro.sh"
diff --git a/kokoro/gcp_ubuntu/bazel/integrations/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/integrations/google.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/bazel/integrations/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/integrations/google.cfg
diff --git a/kokoro/gcp_ubuntu/bazel/integrations/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/integrations/main.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/bazel/integrations/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/integrations/main.cfg
diff --git a/kokoro/gcp_ubuntu/bazel/integrations/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/integrations/presubmit.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/bazel/integrations/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/integrations/presubmit.cfg
diff --git a/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/build_kokoro.sh
old mode 100644
new mode 100755
similarity index 73%
rename from kokoro/gcp_ubuntu/cmake/android/arm64-v8a/build_kokoro.sh
rename to build_tools/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/build_kokoro.sh
index f912073..329e226
--- a/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/build_kokoro.sh
@@ -16,28 +16,28 @@
 
 # Cross-compile the project towards Android arm64-v8a with the
 # gcr.io/iree-oss/cmake-android image using Kokoro.
+# Requires the environment variables KOKORO_ROOT and KOKORO_ARTIFACTS_DIR, which
+# are set by Kokoro.
 
-set -e
 set -x
+set -e
+set -o pipefail
 
 # Print the UTC time when set -x is on
 export PS4='[$(date -u "+%T %Z")] '
 
-# Kokoro checks out the repository here.
-WORKDIR=${KOKORO_ARTIFACTS_DIR?}/github/iree
+source "${KOKORO_ARTIFACTS_DIR?}/github/iree/build_tools/kokoro/gcp_ubuntu/docker_common.sh"
 
-# Mount the checked out repository, make that the working directory and run the
-# tests in the cmake-android image.
-docker run \
-  --volume "${WORKDIR?}:${WORKDIR?}" \
-  --workdir="${WORKDIR?}" \
-  --rm \
+# Sets DOCKER_RUN_ARGS
+docker_setup
+
+docker run "${DOCKER_RUN_ARGS[@]?}" \
   gcr.io/iree-oss/cmake-android:prod \
-  kokoro/gcp_ubuntu/cmake/android/build.sh arm64-v8a
+  build_tools/kokoro/gcp_ubuntu/cmake/android/build.sh arm64-v8a
 
 # Kokoro will rsync this entire directory back to the executor orchestrating the
 # build which takes forever and is totally useless.
-sudo rm -rf "${KOKORO_ARTIFACTS_DIR?}"/*
+rm -rf "${KOKORO_ARTIFACTS_DIR?}"/*
 
 # Print out artifacts dir contents after deleting them as a coherence check.
 ls -1a "${KOKORO_ARTIFACTS_DIR?}/"
diff --git a/kokoro/gcp_ubuntu/cmake/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/common.cfg
similarity index 75%
copy from kokoro/gcp_ubuntu/cmake/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/common.cfg
index e4cc270..1376e08 100644
--- a/kokoro/gcp_ubuntu/cmake/continuous.cfg
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/common.cfg
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Deliberately blank as everything necessary is configured in common files, but
-# file must still exist to match corresponding (Google internal) job
-# configurations that trigger the builds.
+# Common configuration for Kokoro builds that cross-compile IREE towards
+# Android arm64-v8a using CMake.
+
+build_file: "iree/build_tools/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/build_kokoro.sh"
diff --git a/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/google.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/cmake/android/arm64-v8a/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/google.cfg
diff --git a/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/main.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/cmake/android/arm64-v8a/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/main.cfg
diff --git a/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/presubmit.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/cmake/android/arm64-v8a/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/presubmit.cfg
diff --git a/kokoro/gcp_ubuntu/cmake/android/build.sh b/build_tools/kokoro/gcp_ubuntu/cmake/android/build.sh
similarity index 100%
rename from kokoro/gcp_ubuntu/cmake/android/build.sh
rename to build_tools/kokoro/gcp_ubuntu/cmake/android/build.sh
diff --git a/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build.sh b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build.sh
similarity index 67%
copy from kokoro/gcp_ubuntu/cmake/linux/x86-turing/build.sh
copy to build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build.sh
index a56c847..ddd5a74 100755
--- a/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build.sh
@@ -29,9 +29,20 @@
 "$CXX" --version
 python3 --version
 
+# For some reason the environment variable set in base `cmake` image cannot
+# reach the child `cmake-swiftshader` image. Given this environment variable
+# is just a temporary solution, duplicate it here instead of spending all
+# the effort trying to figure out why.
+# TODO(#2645): remove this once we have a better solution for AOT linker
+# discovery.
+export IREE_LLVMAOT_LINKER_PATH=/usr/bin/ld
+
 # Print Vulkan related information: SDK version and GPU ICD version
-vulkaninfo 2>/dev/null | grep "Vulkan Instance"
-vulkaninfo 2>/dev/null | grep -A7 "VkPhysicalDeviceProperties"
+vulkaninfo 2>/dev/null | grep "Vulkan Instance" || echo "Vulkan Instance not found!"
+vulkaninfo 2>/dev/null | grep -A7 "VkPhysicalDeviceProperties"  || echo "VkPhysicalDeviceProperties not found!"
+
+# Print SwiftShader git commit
+cat /swiftshader/git-commit
 
 echo "Initializing submodules"
 ./scripts/git/submodule_versions.py init
diff --git a/kokoro/gcp_ubuntu/bazel/core/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build_kokoro.sh
old mode 100644
new mode 100755
similarity index 62%
copy from kokoro/gcp_ubuntu/bazel/core/build_kokoro.sh
copy to build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build_kokoro.sh
index 031eab8..82426ec
--- a/kokoro/gcp_ubuntu/bazel/core/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build_kokoro.sh
@@ -14,30 +14,31 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Build and test IREE's core within the gcr.io/iree-oss/bazel image using
-# Kokoro.
+# Build and test the project within the gcr.io/iree-oss/cmake-swiftshader image
+# using Kokoro.
+# Requires the environment variables KOKORO_ROOT and KOKORO_ARTIFACTS_DIR, which
+# are set by Kokoro.
 
-set -e
 set -x
+set -e
+set -o pipefail
 
 # Print the UTC time when set -x is on
 export PS4='[$(date -u "+%T %Z")] '
 
-# Kokoro checks out the repository here.
-WORKDIR="${KOKORO_ARTIFACTS_DIR?}/github/iree"
+source "${KOKORO_ARTIFACTS_DIR?}/github/iree/build_tools/kokoro/gcp_ubuntu/docker_common.sh"
 
-# Mount the checked out repository, make that the working directory and run the
-# tests in the bazel image.
-docker run \
-  --volume "${WORKDIR?}:${WORKDIR?}" \
-  --workdir="${WORKDIR?}" \
-  --rm \
-  gcr.io/iree-oss/bazel:prod \
-  kokoro/gcp_ubuntu/bazel/core/build.sh
+# Sets DOCKER_RUN_ARGS
+docker_setup
+
+docker run "${DOCKER_RUN_ARGS[@]?}" \
+  --env IREE_VULKAN_DISABLE=0 \
+  gcr.io/iree-oss/cmake-swiftshader:prod \
+  build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build.sh
 
 # Kokoro will rsync this entire directory back to the executor orchestrating the
 # build which takes forever and is totally useless.
-sudo rm -rf "${KOKORO_ARTIFACTS_DIR?}"/*
+rm -rf "${KOKORO_ARTIFACTS_DIR?}"/*
 
 # Print out artifacts dir contents after deleting them as a coherence check.
 ls -1a "${KOKORO_ARTIFACTS_DIR?}/"
diff --git a/kokoro/gcp_ubuntu/cmake/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/common.cfg
similarity index 76%
rename from kokoro/gcp_ubuntu/cmake/continuous.cfg
rename to build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/common.cfg
index e4cc270..9f4a82c 100644
--- a/kokoro/gcp_ubuntu/cmake/continuous.cfg
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/common.cfg
@@ -14,6 +14,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Deliberately blank as everything necessary is configured in common files, but
-# file must still exist to match corresponding (Google internal) job
-# configurations that trigger the builds.
+build_file: "iree/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build_kokoro.sh"
diff --git a/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/continuous.cfg
similarity index 100%
rename from kokoro/gcp_ubuntu/cmake/android/arm64-v8a/continuous.cfg
rename to build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/continuous.cfg
diff --git a/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/google.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/cmake/android/arm64-v8a/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/google.cfg
diff --git a/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/main.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/cmake/android/arm64-v8a/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/main.cfg
diff --git a/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/presubmit.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/cmake/android/arm64-v8a/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/presubmit.cfg
diff --git a/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build.sh b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build.sh
similarity index 87%
rename from kokoro/gcp_ubuntu/cmake/linux/x86-turing/build.sh
rename to build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build.sh
index a56c847..f44aa82 100755
--- a/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build.sh
@@ -30,8 +30,8 @@
 python3 --version
 
 # Print Vulkan related information: SDK version and GPU ICD version
-vulkaninfo 2>/dev/null | grep "Vulkan Instance"
-vulkaninfo 2>/dev/null | grep -A7 "VkPhysicalDeviceProperties"
+vulkaninfo 2>/dev/null | grep "Vulkan Instance" || echo "Vulkan Instance not found!"
+vulkaninfo 2>/dev/null | grep -A7 "VkPhysicalDeviceProperties"  || echo "VkPhysicalDeviceProperties not found!"
 
 echo "Initializing submodules"
 ./scripts/git/submodule_versions.py init
diff --git a/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build_kokoro.sh
similarity index 64%
rename from kokoro/gcp_ubuntu/cmake/linux/x86-turing/build_kokoro.sh
rename to build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build_kokoro.sh
index 051c0d0..0b2364a 100755
--- a/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build_kokoro.sh
@@ -14,32 +14,32 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Build and test the project within the gcr.io/iree-oss/cmake using Kokoro.
+# Build and test the project within the gcr.io/iree-oss/cmake image using
+# Kokoro.
+# Requires the environment variables KOKORO_ROOT and KOKORO_ARTIFACTS_DIR, which
+# are set by Kokoro.
 
-set -e
 set -x
+set -e
+set -o pipefail
 
 # Print the UTC time when set -x is on
 export PS4='[$(date -u "+%T %Z")] '
 
-# Kokoro checks out the repository here.
-WORKDIR=${KOKORO_ARTIFACTS_DIR?}/github/iree
+source "${KOKORO_ARTIFACTS_DIR?}/github/iree/build_tools/kokoro/gcp_ubuntu/docker_common.sh"
 
-# Mount the checked out repository, make that the working directory and run the
-# tests in the cmake image.
-docker run \
-  --volume "${WORKDIR?}:${WORKDIR?}" \
-  --workdir="${WORKDIR?}" \
-  --rm \
+# Sets DOCKER_RUN_ARGS
+docker_setup
+
+docker run "${DOCKER_RUN_ARGS[@]?}" \
   --env IREE_VULKAN_DISABLE=0 \
   --gpus all \
   gcr.io/iree-oss/cmake-nvidia:prod \
-  kokoro/gcp_ubuntu/cmake/linux/x86-turing/build.sh
+  build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build.sh
 
 # Kokoro will rsync this entire directory back to the executor orchestrating the
 # build which takes forever and is totally useless.
-# TODO: enable this after making it work
-#sudo rm -rf "${KOKORO_ARTIFACTS_DIR?}"/*
+rm -rf "${KOKORO_ARTIFACTS_DIR?}"/*
 
 # Print out artifacts dir contents after deleting them as a coherence check.
-#ls -1a "${KOKORO_ARTIFACTS_DIR?}/"
+ls -1a "${KOKORO_ARTIFACTS_DIR?}/"
diff --git a/kokoro/gcp_ubuntu/cmake/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/common.cfg
similarity index 76%
copy from kokoro/gcp_ubuntu/cmake/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/common.cfg
index e4cc270..bdb9163 100644
--- a/kokoro/gcp_ubuntu/cmake/continuous.cfg
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/common.cfg
@@ -14,6 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Deliberately blank as everything necessary is configured in common files, but
-# file must still exist to match corresponding (Google internal) job
-# configurations that trigger the builds.
+# Common configuration for Kokoro builds that run cmake on linux.
+
+build_file: "iree/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build_kokoro.sh"
diff --git a/kokoro/gcp_ubuntu/cmake/linux/x86-turing/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/google.cfg
similarity index 100%
rename from kokoro/gcp_ubuntu/cmake/linux/x86-turing/continuous.cfg
rename to build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/google.cfg
diff --git a/kokoro/gcp_ubuntu/cmake/linux/x86-turing/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/main.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/cmake/linux/x86-turing/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/main.cfg
diff --git a/kokoro/gcp_ubuntu/cmake/linux/x86-turing/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/presubmit.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/cmake/linux/x86-turing/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/presubmit.cfg
diff --git a/kokoro/gcp_ubuntu/cmake/build.sh b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86/build.sh
similarity index 100%
rename from kokoro/gcp_ubuntu/cmake/build.sh
rename to build_tools/kokoro/gcp_ubuntu/cmake/linux/x86/build.sh
diff --git a/kokoro/gcp_ubuntu/cmake/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86/build_kokoro.sh
similarity index 72%
rename from kokoro/gcp_ubuntu/cmake/build_kokoro.sh
rename to build_tools/kokoro/gcp_ubuntu/cmake/linux/x86/build_kokoro.sh
index 8c35e60..ee459ff 100755
--- a/kokoro/gcp_ubuntu/cmake/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86/build_kokoro.sh
@@ -15,28 +15,28 @@
 # limitations under the License.
 
 # Build and test the project within the gcr.io/iree-oss/cmake using Kokoro.
+# Requires the environment variables KOKORO_ROOT and KOKORO_ARTIFACTS_DIR, which
+# are set by Kokoro.
 
-set -e
 set -x
+set -e
+set -o pipefail
 
 # Print the UTC time when set -x is on
 export PS4='[$(date -u "+%T %Z")] '
 
-# Kokoro checks out the repository here.
-WORKDIR=${KOKORO_ARTIFACTS_DIR?}/github/iree
+source "${KOKORO_ARTIFACTS_DIR?}/github/iree/build_tools/kokoro/gcp_ubuntu/docker_common.sh"
 
-# Mount the checked out repository, make that the working directory and run the
-# tests in the cmake image.
-docker run \
-  --volume "${WORKDIR?}:${WORKDIR?}" \
-  --workdir="${WORKDIR?}" \
-  --rm \
+# Sets DOCKER_RUN_ARGS
+docker_setup
+
+docker run "${DOCKER_RUN_ARGS[@]?}" \
   gcr.io/iree-oss/cmake:prod \
-  kokoro/gcp_ubuntu/cmake/build.sh
+  build_tools/kokoro/gcp_ubuntu/cmake/linux/x86/build.sh
 
 # Kokoro will rsync this entire directory back to the executor orchestrating the
 # build which takes forever and is totally useless.
-sudo rm -rf "${KOKORO_ARTIFACTS_DIR?}"/*
+rm -rf "${KOKORO_ARTIFACTS_DIR?}"/*
 
 # Print out artifacts dir contents after deleting them as a coherence check.
 ls -1a "${KOKORO_ARTIFACTS_DIR?}/"
diff --git a/kokoro/gcp_ubuntu/cmake/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86/common.cfg
similarity index 76%
copy from kokoro/gcp_ubuntu/cmake/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/cmake/linux/x86/common.cfg
index e4cc270..49e6865 100644
--- a/kokoro/gcp_ubuntu/cmake/continuous.cfg
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86/common.cfg
@@ -14,6 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Deliberately blank as everything necessary is configured in common files, but
-# file must still exist to match corresponding (Google internal) job
-# configurations that trigger the builds.
+# Common configuration for Kokoro builds that run cmake on linux.
+
+build_file: "iree/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86/build_kokoro.sh"
diff --git a/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86/google.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/cmake/android/arm64-v8a/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/cmake/linux/x86/google.cfg
diff --git a/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86/main.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/cmake/android/arm64-v8a/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/cmake/linux/x86/main.cfg
diff --git a/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/continuous.cfg b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86/presubmit.cfg
similarity index 100%
copy from kokoro/gcp_ubuntu/cmake/android/arm64-v8a/continuous.cfg
copy to build_tools/kokoro/gcp_ubuntu/cmake/linux/x86/presubmit.cfg
diff --git a/build_tools/kokoro/gcp_ubuntu/docker_common.sh b/build_tools/kokoro/gcp_ubuntu/docker_common.sh
new file mode 100644
index 0000000..d0b15dd
--- /dev/null
+++ b/build_tools/kokoro/gcp_ubuntu/docker_common.sh
@@ -0,0 +1,99 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Functions for setting up Docker containers to run on Kokoro
+
+# Sets up files and environment to enable running all our Kokoro docker scripts.
+# In particular, does some shenanigans to enable running with the current user.
+# Some of this setup is only strictly necessary for Bazel, but it doesn't hurt
+# for anything else.
+# Requires that KOKORO_ROOT and KOKORO_ARTIFACTS_DIR have been set
+# Sets the environment variable DOCKER_RUN_ARGS to be used by subsequent
+# `docker run` invocations.
+function docker_setup() {
+    # Make the source repository available and launch containers in that
+    # directory.
+    local workdir="${KOKORO_ARTIFACTS_DIR?}/github/iree"
+    DOCKER_RUN_ARGS=(
+      --volume="${workdir?}:${workdir?}"
+      --workdir="${workdir?}"
+    )
+
+    # Delete the container after the run is complete.
+    DOCKER_RUN_ARGS+=(--rm)
+
+
+    # Run as the current user and group. If only it were this simple...
+    DOCKER_RUN_ARGS+=(--user="$(id -u):$(id -g)")
+
+
+    # The Docker container doesn't know about the users and groups of the host
+    # system. We have to tell it. This is just a mapping of IDs to names though.
+    # The thing that really matters is the IDs, so the key thing is that Docker
+    # writes files as the same ID as the current user, which we set above, but
+    # without the group and passwd file, lots of things get upset because they
+    # don't recognize the current user ID (e.g. `whoami` fails). Bazel in
+    # particular looks for a home directory and is not happy when it can't find
+    # one.
+    # So we make the container share the host mapping, which guarantees that the
+    # current user is mapped. If there was any user or group in the container
+    # that we cared about, this wouldn't necessarily work because the host and
+    # container don't necessarily map the ID to the same user. Luckily though,
+    # we don't.
+    # We don't just mount the real /etc/passwd and /etc/group because Google
+    # Linux workstations do some interesting stuff with user/group permissions
+    # such that they don't contain the information about normal users and we
+    # want these scripts to be runnable locally for debugging.
+    # Instead we dump the results of `getent` to some fake files.
+    local fake_etc_dir="${KOKORO_ROOT?}/fake_etc"
+    mkdir -p "${fake_etc_dir?}"
+
+    local fake_group="${fake_etc_dir?}/group"
+    local fake_passwd="${fake_etc_dir?}/passwd"
+
+    getent group > "${fake_group?}"
+    getent passwd > "${fake_passwd?}"
+
+    DOCKER_RUN_ARGS+=(
+      --volume="${fake_group?}:/etc/group:ro"
+      --volume="${fake_passwd?}:/etc/passwd:ro"
+    )
+
+
+    # Bazel stores its cache in the user home directory by default. It's
+    # possible to override this, but that would require changing our Bazel
+    # startup options, which means polluting all our scripts and making them not
+    # runnable locally. Instead, we give it a special home directory to write
+    # into. We don't just mount the user home directory (or some subset thereof)
+    # for two reasons:
+    #   1. We probably don't want Docker to just write into the user's home
+    #      directory when running locally.
+    #   2. When running with Kokoro, we mount a local scratch SSD to KOKORO_ROOT
+    #      whereas the home directory is on the persistent SSD boot disk. It
+    #      turns out that makes a huge difference in performance for Bazel
+    #      running with local execution (not with RBE) because it is IO bound at
+    #      64 cores.
+    local fake_home_dir="${KOKORO_ROOT?}/fake_home"
+    mkdir -p "${fake_home_dir}"
+
+    DOCKER_RUN_ARGS+=(
+      --volume="${fake_home_dir?}:${HOME?}"
+    )
+
+    # Make gcloud credentials available. This isn't necessary when running in
+    # GCE but enables using this script locally with RBE.
+    DOCKER_RUN_ARGS+=(
+      --volume="${HOME?}/.config/gcloud:${HOME?}/.config/gcloud:ro"
+    )
+}
diff --git a/build_tools/kokoro/gcp_ubuntu/simulate_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/simulate_kokoro.sh
new file mode 100755
index 0000000..38c6782
--- /dev/null
+++ b/build_tools/kokoro/gcp_ubuntu/simulate_kokoro.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Simulates the behavior of Kokoro on a local machine.
+# Usage:
+#   ./kokoro/gcp_ubuntu/simulate_kokoro.sh build_tools/kokoro/gcp_ubuntu/bazel/core/build.sh
+#
+# Just does the part of the Kokoro setup that we care about and invokes the
+# given build script.
+# An optional second parameter can be used to specify a different repo to clone
+# from. Especially useful for cloning the current git repo.
+#   ./kokoro/gcp_ubuntu/simulate_kokoro.sh build_tools/kokoro/gcp_ubuntu/bazel/core/build.sh "$PWD/.git"
+
+set -x
+set -e
+set -o pipefail
+
+RELATIVE_KOKORO_BUILD_SCRIPT="${1?}"
+REPO_TO_CLONE="${2:-git@github.com:google/iree.git}"
+
+# Set up the temporary Kokoro directories
+export KOKORO_ROOT="$(mktemp --directory --tmpdir kokoro-root-XXXXXX)"
+mkdir -p "${KOKORO_ROOT?}/src/github"
+export KOKORO_ARTIFACTS_DIR="${KOKORO_ROOT?}/src"
+cd "${KOKORO_ARTIFACTS_DIR?}/github"
+
+# Clone the repo
+git clone "${REPO_TO_CLONE?}"
+
+# The build script is assumed to be relative to the iree repo root.
+KOKORO_BUILD_SCRIPT="${KOKORO_ARTIFACTS_DIR?}/github/iree/${RELATIVE_KOKORO_BUILD_SCRIPT?}"
+chmod +x "${KOKORO_BUILD_SCRIPT?}"
+
+# This is where Kokoro starts its execution.
+cd "${KOKORO_ARTIFACTS_DIR?}"
+
+# Run the actual script.
+"${KOKORO_BUILD_SCRIPT?}"
+
+# Clean up after ourselves.
+rm -rf "${KOKORO_ROOT?}"
diff --git a/build_tools/manylinux_py_setup.py b/build_tools/manylinux_py_setup.py
new file mode 100755
index 0000000..abdd11d
--- /dev/null
+++ b/build_tools/manylinux_py_setup.py
@@ -0,0 +1,86 @@
+#!/opt/python/cp38-cp38/bin/python3
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Generates CMake arguments to build all manylinux python versions.
+
+manylinux containers have all python version linked under /opt/python.
+This script scrapes them to get configuration, install deps, etc.
+
+Usage:
+  Install dependencies:
+    manylinux_py_setup.py deps
+  Get CMake arguments to build (typically via $() expansion):
+    manylinux_py_setup.py args
+"""
+
+import os
+from pathlib import Path
+import subprocess
+import sys
+import sysconfig
+
+
+def _get_python_exes():
+  PYTHON_PARENT_PATH = Path("/opt/python")
+  return PYTHON_PARENT_PATH.glob("*/bin/python")
+
+
+def install_deps():
+  for python_exe in _get_python_exes():
+    args = [
+        str(python_exe),
+        "-m",
+        "pip",
+        "install",
+        "absl-py",
+        "numpy",
+    ]
+    print("EXEC:", " ".join(args))
+    subprocess.check_call(args)
+
+
+def dump_current(identifier):
+  print("-DIREE_MULTIPY_{}_EXECUTABLE='{}'".format(identifier, sys.executable))
+  print("-DIREE_MULTIPY_{}_INCLUDE_DIRS='{}'".format(
+      identifier, sysconfig.get_config_var("INCLUDEPY")))
+  # TODO: Print LIBRARIES for Windows and OSX
+  print("-DIREE_MULTIPY_{}_EXTENSION='{}'".format(
+      identifier, sysconfig.get_config_var("EXT_SUFFIX")))
+
+
+def dump_all():
+  versions_ids = []
+  for python_exe in _get_python_exes():
+    identifier = python_exe.parent.parent.name
+    versions_ids.append(identifier)
+    # Invoke ourselves with a different interpreter/args to dump config.
+    subprocess.check_call(
+        [str(python_exe), __file__, "_current_args", identifier])
+  print("-DIREE_MULTIPY_VERSIONS='{}'".format(";".join(versions_ids)))
+
+
+if __name__ == "__main__":
+  if len(sys.argv) < 2:
+    print("SYNTAX: manylinux_py_setup.py {deps|args}")
+    sys.exit(1)
+  command = sys.argv[1]
+  if command == "args":
+    dump_all()
+  elif command == "_current_args":
+    dump_current(sys.argv[2])
+  elif command == "deps":
+    install_deps()
+  else:
+    print("Unexpected command")
+    sys.exit(1)
diff --git a/build_tools/third_party/sdl2/BUILD.overlay b/build_tools/third_party/sdl2/BUILD.overlay
index 951160a..ba663f0 100644
--- a/build_tools/third_party/sdl2/BUILD.overlay
+++ b/build_tools/third_party/sdl2/BUILD.overlay
@@ -26,8 +26,15 @@
     "src/render/direct3d*/**",
     "src/render/SDL_d3d*",
     "src/haptic/windows/**",
+    "src/locale/windows/**",
+    "src/locale/winrt/**",
+    # Ignore Android and Emscripten locales
+    "src/locale/android/**",
+    "src/locale/emscripten/**",
     # Remove support for QNX
     "src/video/qnx/**",
+    # No dbus -> don't try to compile this file
+    "src/core/linux/SDL_fcitx.c",
 ]
 
 sdl_sources = select({
@@ -42,6 +49,9 @@
             # Ignore Linux/Unix
             "src/core/linux/**",
             "src/core/unix/**",
+            # Ignore Android and Emscripten locales
+            "src/locale/android/**",
+            "src/locale/emscripten/**",
             # Ignore thread (exclude for thread/windows)
             "src/thread/**",
             # Remove support for QNX
@@ -86,6 +96,7 @@
     "include/SDL_keyboard.h",
     "include/SDL_keycode.h",
     "include/SDL_loadso.h",
+    "include/SDL_locale.h",
     "include/SDL_log.h",
     "include/SDL_main.h",
     "include/SDL_messagebox.h",
diff --git a/build_tools/third_party/sdl2/SDL_config_windows.h b/build_tools/third_party/sdl2/SDL_config_windows.h
index 460fb3b..7626de6 100644
--- a/build_tools/third_party/sdl2/SDL_config_windows.h
+++ b/build_tools/third_party/sdl2/SDL_config_windows.h
@@ -227,6 +227,9 @@
 /* Enable filesystem support */
 #define SDL_FILESYSTEM_WINDOWS 1
 
+/* Disable sensor support */
+#define SDL_SENSOR_DISABLED 1
+
 /* Enable assembly routines (Win64 doesn't have inline asm) */
 #ifndef _WIN64
 #define SDL_ASSEMBLY_ROUTINES 1
diff --git a/build_tools/third_party/swiftshader/build_vk_swiftshader.sh b/build_tools/third_party/swiftshader/build_vk_swiftshader.sh
old mode 100644
new mode 100755
diff --git a/colab/README.md b/colab/README.md
index d35df2a..b0bfe52 100644
--- a/colab/README.md
+++ b/colab/README.md
@@ -1,7 +1,7 @@
 # Google Colaboratory (Colab) Notebooks
 
 To run these notebooks with a local runtime, refer to the
-[Using Colab docs](../docs/using_colab.md).
+[Using Colab docs](../docs/using_iree/using_colab.md).
 
 Hosted/remote runtimes are not yet supported.
 
diff --git a/docs/design_docs/codegen_passes.md b/docs/design_docs/codegen_passes.md
new file mode 100644
index 0000000..83e37fc
--- /dev/null
+++ b/docs/design_docs/codegen_passes.md
@@ -0,0 +1,640 @@
+# IREE CPU/GPU Code Generation Pipeline
+
+This document is intended to provide an overview of the codegen pipeline within
+IREE used to generate CPU/GPU code. It intends to give an overview of the main
+passes used, the objective of the pass, the current implementation, and what it
+is expected to achieve in the long term.
+
+Note that while the code generation pipeline supports dynamic shapes, this work
+is very preliminary. The description of this is not covered here.
+
+## Input to the codegen pipeline
+
+The input to the code generation pipeline is the module within the
+`hal.executable.target` operation. Functions within this module that do __not__
+have `Visibility::Private` are the *entry point* functions of the dispatch
+region. These are the functions that are *invoked* by the IREE runtime. In
+addition, each dispatch region also contains a `hal.interface` operation that
+describes the ABI to use for the dispatch region. Two examples of the input to
+the code generation pipeline are shown below. In both of these, a single
+dispatch function contains a sequence of MHLO operations that the dispatch
+region creation has grouped into a single region. Ideally the grouped operations
+are fused into a single kernel.
+
+```mlir
+hal.executable.target "vulkan*" {
+  module attributes {spv.target_env = ...} {
+    func @main_ex_dispatch() {
+      %c0 = constant 0 : index
+      %0 = hal.interface.load.tensor @legacy_io::@arg0,
+             offset = %c0 : tensor<4x5xf32>
+      %1 = hal.interface.load.tensor @legacy_io::@arg1,
+             offset = %c0 : tensor<5x10xf32>
+      %2 = "mhlo.dot"(%0, %1) {precision_config = ["DEFAULT", "DEFAULT"]} :
+             (tensor<4x5xf32>, tensor<5x10xf32>) -> tensor<4x10xf32>
+      hal.interface.store.tensor %2, @legacy_io::@ret0,
+        offset = %c0 : tensor<4x10xf32>
+      return
+    }
+    hal.interface @legacy_io attributes {sym_visibility = "private"} {
+      hal.interface.binding @arg0, set=0, binding=0,
+        type="StorageBuffer", access="Read"
+      hal.interface.binding @arg1, set=0, binding=1,
+        type="StorageBuffer", access="Read"
+      hal.interface.binding @ret0, set=0, binding=2,
+        type="StorageBuffer", access="Write|Discard"
+    }
+  }
+}
+```
+
+<a name="snippet1"></a> Snippet 1 : Dispatch region with matrix-matrix multiply
+operation.
+
+```mlir
+hal.executable.target "vulkan*" {
+  module attributes {spv.target_env = ...} {
+    func @main_ex_dispatch() {
+      %c0 = constant 0 : index
+      %0 = hal.interface.load.tensor @legacy_io::@arg0,
+             offset = %c0 : tensor<10x5xf32>
+      %1 = hal.interface.load.tensor @legacy_io::@arg1,
+             offset = %c0 : tensor<10x5xf32>
+      %2 = hal.interface.load.tensor @legacy_io::@arg2,
+             offset = %c0 : tensor<10x5xf32>
+      %3 = "mhlo.add"(%0, %1) :
+         (tensor<10x5xf32>, tensor<10x5xf32>) -> tensor<10x5xf32>
+      %4 = "mhlo.multiply"(%3, %2) :
+         (tensor<10x5xf32>, tensor<10x5xf32>) -> tensor<10x5xf32>
+      hal.interface.store.tensor %4, @legacy_io::@ret0,
+        offset = %c0 : tensor<10x5xf32>
+      return
+    }
+    hal.interface @legacy_io attributes {sym_visibility = "private"} {
+      hal.interface.binding @arg0, set=0, binding=0,
+        type="StorageBuffer", access="Read"
+      hal.interface.binding @arg1, set=0, binding=1,
+        type="StorageBuffer", access="Read"
+      hal.interface.binding @arg2, set=0, binding=2,
+        type="StorageBuffer", access="Read"
+      hal.interface.binding @ret0, set=0, binding=3,
+        type="StorageBuffer", access="Write|Discard"
+    }
+  }
+}
+```
+
+<a name="snippet2"></a> Snippet 2 : Dispatch region with element-wise
+operations.
+
+__Roadmap Note__: The current implementation might not actually fuse the
+operations grouped into a dispatch region into a single kernel. It is possible
+to end up with multiple kernels per dispatch region. Over time we plan to
+address this by using fusion at different levels (see below).
+
+The inputs to the dispatch region are materialized within the entry point
+function using the `hal.interface.load.tensor` operation. This operation returns
+a `tensor` view of the buffer used to store the inputs. Similarly the result of
+the dispatch region are *written* out using the `hal.interface.store.tensor`
+operation.
+
+The main constraint that the code generation operates under is that it should
+not require additional (temporary) buffers to execute the operations grouped
+together within a dispatch region. The rationale behind this constraint is that
+buffer allocation/synchronization in IREE happens at the granularity of dispatch
+regions, allowing the scheduler to make better decision about where to insert
+appropriate synchronizations.
+
+The IR after all the passes used in the lowering from MHLO to SPIR-V for the
+above two examples can be found here ([matrix-matrix multiply op][DotAfterAll],
+[elementwise ops][PwAfterAll]). Below is a description of the major passes used.
+
+## Conversion from MHLO dialect to Linalg on buffers
+
+The code generation pipeline heavily relies on use of
+[Structured Operations][LinalgRationale], specifically the
+[Linalg Dialect][LinalgDialect]. Both, the Linalg operations on `tensor`s and on
+`memref`s are central to the progressive lowering approach followed here. The
+first part of the code generation pipeline is to convert the MHLO operations on
+`tensor`s to Linalg operation on `memref`s. This part of the pipeline is common
+to both CPU and GPU code generation.
+
+The steps involved in this conversion are shown below. Each of the arrows
+represents a pass in the pipeline:
+
+![MHLO To Linalg on `memref` conversion](./hlo_to_linalg.png)
+
+The next sections describe each of these passes in more detail.
+
+### MHLO to Linalg on tensors
+
+The first step is to convert MHLO operations to Linalg on tensors. This is done
+using the [HLOToLinalgPass][HLOToLinalgPass] from Tensorflow. An example of the
+conversion is shown below, where each of the `mhlo.add` and `mhlo.multiply`
+operations are converted to `linalg.generic` operations on tensors.
+
+```mlir
+#map0 = affine_map<(d0, d1) -> (d0, d1)>
+%3 = linalg.generic
+       {args_in = 2 : i64, args_out = 1 : i64,
+        indexing_maps = [#map0, #map0, #map0],
+        iterator_types = ["parallel", "parallel"]} %0, %1 {
+     ^bb0(%arg0: f32, %arg1: f32):  // no predecessors
+       %5 = addf %arg0, %arg1 : f32
+       linalg.yield %5 : f32
+     } : tensor<10x5xf32>, tensor<10x5xf32> -> tensor<10x5xf32>
+%4 = linalg.generic
+       {args_in = 2 : i64, args_out = 1 : i64,
+        indexing_maps = [#map0, #map0, #map0],
+        iterator_types = ["parallel", "parallel"]} %3, %2 {
+     ^bb0(%arg0: f32, %arg1: f32):  // no predecessors
+       %5 = mulf %arg0, %arg1 : f32
+       linalg.yield %5 : f32
+     }: tensor<10x5xf32>, tensor<10x5xf32> -> tensor<10x5xf32>
+```
+
+<a name="snippet3"></a> Snippet 3 : MHLO to Linalg conversion for
+[element-wise operations](#snippet2)
+
+At the time of writing the representation of Linalg on `tensor`s does not model
+reduction iterator types completely. Specifically, the reduction in Linalg is
+modeled using read-modify-write approach, i.e. each iteration of the reduction
+loop reads the value stored in the output, adds its contribution, and writes
+back to the same location. This means the output has to be *initialized* to the
+null element of the reduction operator (i.e. 0 if the reduction is done using
+addition). This works for operations on buffers. Since tensors are SSA values
+they cannot be updated in-place. As a result, the reduction semantics does not
+map as well to `tensor`s. For now it is treated as a convention that when the
+Linalg operation is converted to use `memref`s it has to be initialized
+appropriately before performing the reduction. Due to this, the conversion from
+MHLO op to Linalg op is only done for operations which do not need a *reduction*
+iterator type in the converted Linalg op. Consequently, only element-wise
+operations, broadcast operations and data movement operations (like copy and
+transpose) are converted to Linalg operations at this stage.
+
+__Roadmap note__: One long term solution for the above is to have operations on
+tensors that have *reduction* iterator type to take an additional argument that
+contains the initial value of the result tensor. When the operation is converted
+to use `memref`s, the buffer for the initial value operand can be reused for the
+result. The details involved have not been fully worked out yet.
+
+### Fusion of Linalg on tensor operations
+
+The Linalg on `tensor` operations generated at the previous step are fused using
+the [LinalgFusionOfTensorOps][LinalgFusionOfTensorOps] from MLIR. Since
+`tensor`s are SSA values, fusion at this stage can be done without using alias
+analysis or dependence analysis based on reads and writes. Instead the use-def
+chains for the `tensor` values can be used to implement producer-consumer
+fusion. This stage fuses most elementwise operations, broadcast operations and
+data movement operations. An example of the fused op is shown below.
+
+```mlir
+#map0 = affine_map<(d0, d1) -> (d0, d1)>
+%3 = linalg.generic
+       {args_in = 3 : i64, args_out = 1 : i64,
+        indexing_maps = [#map0, #map0, #map0, #map0],
+        iterator_types = ["parallel", "parallel"]} %0, %1, %2 {
+     ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):  // no predecessors
+       %4 = addf %arg0, %arg1 : f32
+       %5 = mulf %4, %arg2 : f32
+       linalg.yield %5 : f32
+     }: tensor<?x5xf32>, tensor<?x5xf32>, tensor<?x5xf32> -> tensor<?x5xf32>
+```
+
+<a name="snippet4"></a> Snippet 4: Fusion of Linalg operation on tensors for
+element-wise operations shown in [Snippet 3](#snippet3)
+
+### Conversion of Linalg on tensors to Linalg on buffers
+
+Post fusion all the operations on `tensor`s are converted to analogous operations
+on `memref`s. In general, this requires a buffer allocation pass. In IREE,
+buffer allocation happens at the granularity of dispatch region, and as
+mentioned [earlier](#input-to-the-codegen-pipeline), the dispatch region is not
+expected to use any additional temporary buffers. So instead of having another
+buffer allocation pass within the code generation pipeline, a simpler approach
+is used within IREE:
+
+-   For each `hal.interface.store.tensor` an `iree.placeholder` operation is
+    created. The latter uses the same `hal.interface.binding` as the former, but
+    returns a `memref` view of the output of the dispatch region instead of a
+    `tensor` view. This `iree.placeholder` operation is added to start of the
+    entry point function.
+
+-   A map is constructed that for a given `tensor` records the `memref` value to
+    use during the conversion. In this map the `tensor` value used in the
+    `hal.interface.store.tensor` is mapped to the `memref` value returned by the
+    created `iree.placeholder` operation.
+
+-   The Dialect Conversion framework is used to implement a set of patterns that
+    convert from operations on `tensor`s to operations on `memref`s:
+
+    -   A `hal.interface.load.tensor`, is replaced with an `iree.placeholder` to
+        get the `memref` view of the input to the dispatch region.
+    -   All Linalg operation on `tensor`s (expected to be just `linalg.generic`
+        or `linalg.indexed_generic` operations) are converted to the
+        corresponding operation on `memref`s. Instead of returning a `tensor`
+        value the converted operation takes an additional `memref` operand as
+        argument. This `memref` is where the result of the operation is
+        populated. Current implementation looks for the `memref` to use from the
+        map constructed previously. If there is no `memref` associated with the
+        result `tensor` the conversion fails.
+    -   At this stage, any `mhlo` operation not converted to a Linalg operation
+        are directly converted to a Linalg operation on buffers. This is done
+        for operations that when converted to Linalg have a *reduction* iterator
+        type. Some examples of ops converted this way are
+
+        -   `mhlo.dot`
+        -   `mhlo.reduce`
+        -   `mhlo.conv`
+        -   `mhlo.reduce_window`.
+
+        Since the specification of the Linalg operations require the output
+        `memref` to be initialized appropriately, a `linalg.fill` operation is
+        used to achieve this.
+
+__Roadmap Note__ : Right now the code-generation pipeline relies on fusion of
+operations on tensor level. In the near future, we want to be able to fuse
+operations like `linalg.matmul` and `linalg.conv` with consumers/producers that
+are element-wise operations using the
+[fusion of Linalg operation on `memref`s][LinalgFusionOnBuffers].
+
+At this stage of the compilation all operations must have been converted to
+Linalg operations on buffers. Shown below are the IR at the end of this stage
+for the two examples in Snippets 1 and 2.
+
+```mlir
+func @main_ex_dispatch() {
+  %0 = iree.placeholder for "interface buffer"
+         {binding = @legacy_io::@ret0} : memref<4x10xf32>
+  %c0 = constant 0 : index
+  %1 = iree.placeholder for "interface buffer"
+         {binding = @legacy_io::@arg0} : memref<4x5xf32>
+  %2 = iree.placeholder for "interface buffer"
+         {binding = @legacy_io::@arg1} : memref<5x10xf32>
+  %cst = constant 0.000000e+00 : f32
+  linalg.matmul(%1, %2, %0) :
+    memref<4x5xf32>, memref<5x10xf32>, memref<4x10xf32>
+  return
+}
+```
+
+<a name="snippet5"></a> Snippet 5 : Matrix-matrix multiply after conversion to
+Linalg operation on `memref`s.
+
+```mlir
+#map0 = affine_map<(d0, d1) -> (d0, d1)>
+func @main_ex_dispatch() {
+  %0 = iree.placeholder for "interface buffer"
+         {binding = @legacy_io::@ret0} : memref<10x5xf32>
+  %c0 = constant 0 : index
+  %1 = iree.placeholder for "interface buffer"
+         {binding = @legacy_io::@arg0} : memref<10x5xf32>
+  %2 = iree.placeholder for "interface buffer"
+         {binding = @legacy_io::@arg1} : memref<10x5xf32>
+  %3 = iree.placeholder for "interface buffer"
+         {binding = @legacy_io::@arg2} : memref<10x5xf32>
+  linalg.generic
+    {args_in = 3 : i64, args_out = 1 : i64,
+     indexing_maps = [#map0, #map0, #map0],
+     iterator_types = ["parallel", "parallel"]} %1, %2, %3, %0 {
+  ^bb0(%arg0: f32, %arg1: f32, %arg2: f32, %arg3: f32):  // no predecessors
+    %4 = addf %arg0, %arg1 : f32
+    %5 = mulf %4, %arg2 : f32
+    linalg.yield %5 : f32
+  }: memref<10x5xf32>, memref<10x5xf32>, memref<10x5xf32>, memref<10x5xf32>
+  return
+}
+```
+
+<a name="snippet6"></a> Snippet 6 : Elementwise operations after conversion to
+Linalg operation on `memref`s
+
+The rest of the code-generation differs on whether the compilation is for CPU
+(using LLVM) or for GPU (using SPIR-V).
+
+## Conversion from Linalg on buffers to SPIR-V dialect
+
+The following sections describe the progressive lowering of Linalg operation on
+buffers to SPIR-V dialect. Once lowered to the SPIR-V dialect, it can be
+serialized into a SPIR-V binary using the
+[serialization mechanism provided by the SPIR-V dialect][SpirvSerialization].
+The steps involved in the lowering are described below, with each of the arrows
+representing a pass.
+
+![Linalg on `memref` to SPIR-V conversion](./linalg_to_spirv.png)
+
+These passes are described below in more detail.
+
+### Tiling and fusion on buffer operations
+
+The GPU hardware typically provides multiple levels of compute hierarchy, namely
+*workgroup* level, *subgroup* level and *workitem* level. These map to blocks,
+warps and threads, respectively, in CUDA terminology. Tiling is a way to map the
+computations to each level of the compute hierarchy. For example 3-D tiling a
+`linalg.matmul` operation decomposes the computation into several tiled
+matrix-matrix multiplies.
+[Tiling transformation in Linalg dialect][LinalgTiling] generates the
+outer-loops that iterate over tiled `linalg.matmul` operations. These outer
+loops can be mapped to different workgroups, if they are parallel. The tiled
+`linalg.matmul` operation can be further tiled to map to subgroups. Finally, the
+tiled operation can be lowered to loops with individual iterations mapped to
+workitems. The [LinalgTileAndFusePass][LinalgTileAndFuse] uses the Linalg Tiling
+patterns ([defined here][LinalgTilingPatterns]) to tile operations like
+`linalg.matmul`, `linalg.conv` and `linalg.*_pooling`. The result of tiling the
+code in Snippet 5 is shown below. As expected there are 2-parallel loops that
+iterate over tiles of the original iteration space (i.e. inter-tile loops) and
+can be distributed to workgroups.
+
+```mlir
+func @main_ex_dispatch_0()
+  attributes {
+    spv.entry_point_abi = {local_size = dense<[8, 8, 1]> : vector<3xi32>}} {
+  %cst = constant 0.000000e+00 : f32
+  %c0 = constant 0 : index
+  %c4 = constant 4 : index
+  %c10 = constant 10 : index
+  %0 = iree.placeholder for "interface buffer"
+         {binding = @legacy_io::@ret0} : memref<4x10xf32>
+  %1 = iree.placeholder for "interface buffer"
+         {binding = @legacy_io::@arg0} : memref<4x5xf32>
+  %2 = iree.placeholder for "interface buffer"
+         {binding = @legacy_io::@arg1} : memref<5x10xf32>
+  linalg.fill(%0, %cst) : memref<4x10xf32>, f32
+  scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c4, %c10) step (%c8, %c8) {
+    scf.for %arg2 = %c0 to %c5 step %c4 {
+      ...
+      %5 = subview %1[%arg0, %arg2]...
+      ...
+      %8 = subview %2[%arg2, %arg1]...
+      ...
+      %11 = subview %0[%arg0, %arg1]..
+      linalg.matmul {__internal_linalg_transform__ = "workgroup"} %5, %8, %11...
+    }
+    scf.yield
+  }
+  return
+}
+```
+
+<a name="snippet7"></a> Snippet 7 : `linalg.matmul` after tiling.
+
+#### Tile Size and Workgroup Size
+
+When operations that are to be tiled exist within the dispatch function (like
+`linalg.matmul` or `linalg.conv`), this pass also decides: 1. the tile size to
+be used for the tiling, and 2. the workgroup size to be used.
+
+The tile size and workgroup size are closely linked since the code within the
+tiled loops are to be collectively executed by the entire workgroup. In other
+words, all workitems in the workgroup collaborate to execute the tiled
+`linalg.matmul`.
+
+__Roadmap Note__ : Currently the tile sizes used in this pass are hard-wired.
+Not much effort has been put into finding ideal tile size for each operation on
+different hardware. The value used is meant to be a baseline to test
+functionality, with performance considerations addressed over time.
+
+#### Markers
+
+Downstream passes have to handle tiled Linalg operations and untiled Linalg
+operation that might exist in the same function in different ways. For example,
+while the former are to be executed collectively by workitems within a
+workgroup, the latter have to be executed by all workitems across workgroups.
+One way to distinguish these two operations is to use the marker mechanism in
+Linalg ([LinalgMarker][LinalgTilingPatterns]). This is a `StrAttr` whose value
+can be used to encode the scope of the operation. For example, in Snippet 7
+above, the tiled `linalg.matmul` operation has a marker `workgroup` to indicate
+that this operation needs to be executed by a workgroup in a collective manner.
+At this time, the code-generation pipeline uses only the `workgroup` marker.
+
+__Roadmap Note__ : Markers are meant to be short-lived, ideally set and consumed
+within the same pass. In the current pipeline the lifetime spans passes to allow
+lowering to different hierarchies. The separate passes that implement the
+lowering from Linalg to SPIR-V can be combined into a single pass, relying on the A ->
+B -> C translation mechanism of the Dialect Conversion framework to implement
+the progressive lowering. In interest of separation of concerns and for better
+debuggability these passes are kept separate at the cost of having lifetimes of
+markers span passes.
+
+#### Promoting subviews to use workgroup local memory and use of synchronizations
+
+`Workgroup` memory (or `shared memory` in CUDA terminology) can be used to
+prefetch the inputs to the tiled operation. For example in the matrix-matrix
+multiply case, the same data row (column) of the LHS (RHS) matrix is read by
+multiple workitems. Prefetching the data into `Workgroup` memory can reduce the
+number of loads to `StorageClass` memory by an order of magnitude. This
+transformation can be achieved by using the
+[`Linalg Promotion`][LinalgPromotionPatterns] which modifies the `subview`s that
+are the operands to the tiled Linalg operation to use a new `memref` object. The
+size of this `memref` is computed from the size of the `subview`. This `memref`
+object is later lowered to use `Workgroup` memory Storage Class. The snippet
+below shows this transformation when applied to `linalg.matmul` (along with
+tiling). The newly created `memref` objects are annotated with the memory space
+`3` to indicate that they are to be lowered to use `Workgroup` memory. The copy
+of data from the original `memref` into the new `memref`, as well as the
+necessary synchronization constructs are generated as well. Note the memory
+space annotation used here is consistent with what
+[address space annotations used in NVVM][NVVMAddressSpace].
+
+```mlir
+func @matmul_tile()
+  attributes {
+    spv.entry_point_abi = {local_size = dense<[8, 8, 1]> : vector<3xi32>}} {
+  %c96 = constant 96 : index
+  %c4 = constant 4 : index
+  %c8 = constant 8 : index
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %0 = iree.placeholder for "interface buffer"
+         {binding = @legacy_io::@arg0} : memref<96x96xf32>
+  %1 = iree.placeholder for "interface buffer"
+         {binding = @legacy_io::@arg1} : memref<96x96xf32>
+  %2 = iree.placeholder for "interface buffer"
+         {binding = @legacy_io::@ret0} : memref<96x96xf32>
+  scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c96, %c96) step (%c8, %c8) {
+    scf.for %arg2 = %c0 to %c96 step %c4 {
+      ...
+      %5 = subview %0[%arg0, %arg2]...
+      ...
+      %8 = subview %1[%arg2, %arg1]...
+      ...
+      %11 = subview %2[%arg0, %arg1]...
+      %12 = alloc(%c8, %c4) : memref<?x?xf32, 3>
+      %13 = subview %12[%c0, %c0]...
+      %14 = alloc(%c4, %c8) : memref<?x?xf32, 3>
+      %15 = subview %14[%c0, %c0]...
+      linalg.copy(%5, %13) {__internal_linalg_transform__ = "workgroup"}
+        : memref<?x?xf32, #map2>, memref<?x?xf32, #map2, 3>
+      spv.ControlBarrier "Workgroup", "Workgroup", "AcquireRelease"
+      linalg.copy(%8, %15) {__internal_linalg_transform__ = "workgroup"}
+        : memref<?x?xf32, #map2>, memref<?x?xf32, #map2, 3>
+      spv.ControlBarrier "Workgroup", "Workgroup", "AcquireRelease"
+      linalg.matmul {__internal_linalg_transform__ = "workgroup"} %13, %15, %11...
+      spv.ControlBarrier "Workgroup", "Workgroup", "AcquireRelease"
+      dealloc %12 : memref<?x?xf32, 3>
+      dealloc %14 : memref<?x?xf32, 3>
+    }
+    scf.yield
+  }
+  return
+}
+```
+
+<a name="snippet8"></a> Snippet 8: `linalg.matmul` after tiling and promotion of
+operand subviews to use `Workgroup` memory.
+
+### Distributing to workgroups and workitems
+
+After tiling the operations within the dispatch functions are either
+`scf.parallel` operations or Linalg operations.
+
+-   The outer `scf.parallel` operations represent parallel loops that are to be
+    distributed across workgroups. The distribution here assumes that the number
+    of workgroups along each dimension is equal to the number of iterations of
+    the `scf.parallel` operation.
+
+-   Linalg operations that are not tiled, and are therefore __not within__ `scf`
+    operations, are lowered to loops. The resulting outer `scf.parallel`
+    operations are collapsed to have a single induction variable. This loop is
+    then distributed across workitems using their `GlobalInvocationId`, (which
+    is same as `blockIdx * blockDim + threadIdx` in CUDA terminology).
+
+-   Linalg operations that are tiled, and are therefore __within__ `scf`
+    operations, are lowered to loops and the iterations of the `scf.parallel`
+    operations are mapped to workitems using their `LocalInvocationId` (which is
+    same as `threadIdx` in CUDA terminology). Note that these operations are
+    tagged with the `workgroup` marker which makes it easy to disambiguate from
+    the case where Linalg operations are outside of `scf` operations. Here too,
+    the distribution assumes that the workgroup size is greater than or equal to
+    the number of iterations of the partitioned loop.
+
+These transformations are applied by the [`ConvertToGPUPass`][ConvertToGPU].
+Below is the result of applying this pass to Snippet 7. The outer `scf.parallel`
+loop is distributed across workgroups. The tiled `linalg.matmul` operation is
+lowered to loops, and the outer `scf.parallel` operation generated during this
+lowering are distributed across workitems within the workgroup.
+
+```mlir
+func @main_ex_dispatch_0_dispatch_1()
+  attributes {
+    spv.entry_point_abi = {local_size = dense<[8, 8, 1]> : vector<3xi32>}} {
+  %c5 = constant 5 : index
+  %c8 = constant 8 : index
+  %c4 = constant 4 : index
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %0 = iree.placeholder for "interface buffer"
+         {binding = @legacy_io::@ret0} : memref<4x10xf32>
+  %1 = iree.placeholder for "interface buffer"
+         {binding = @legacy_io::@arg0} : memref<4x5xf32>
+  %2 = iree.placeholder for "interface buffer"
+         {binding = @legacy_io::@arg1} : memref<5x10xf32>
+  %3 = "gpu.block_id"() {dimension = "x"} : () -> index
+  %4 = muli %3, %c8 : index
+  scf.for %arg0 = %c0 to %c5 step %c4 {
+    ...
+    %9 = subview %1[0, %arg0]
+    ...
+    %14 = subview %2[%arg0, %4]
+    %15 = subview %0[0, %4]
+    %16 = "gpu.thread_id"() {dimension = "x"} : () -> index
+    %17 = "gpu.thread_id"() {dimension = "y"} : () -> index
+    %18 = cmpi "slt", %17, %c4 : index
+    %19 = cmpi "slt", %16, %13 : index
+    %20 = and %18, %19 : i1
+    scf.if %20 {
+      scf.for %arg1 = %c0 to %8 step %c1 {
+        %21 = load %9[%17, %arg1] : memref<4x?xf32, #map0>
+        %22 = load %14[%arg1, %16] : memref<?x?xf32, #map1>
+        %23 = load %15[%17, %16] : memref<4x?xf32, #map1>
+        %24 = mulf %21, %22 : f32
+        %25 = addf %23, %24 : f32
+        store %25, %15[%17, %16] : memref<4x?xf32, #map1>
+      }
+    }
+  }
+  return
+}
+```
+
+<a name="snippet9"></a> Snippet 9: `linalg.matmul` after distributing parallel
+inter-tile loops to workgroups and intra-tile loops to workitems.
+
+[Snippet 6](#snippet6) shows the fused element-wise operations represented using
+a `linalg.generic` operation. This operation is not tiled in the
+`LinalgTileAndFusePass`. So the `ConvertToGPUPass` lowers this operation to
+`scf.parallel` loops, which are collapsed into a `scf.parallel` operation with a
+single induction variable. This loop is then distributed across workitems using
+the `GlobalInvocationId`. The resulting IR is shown below.
+
+```mlir
+func @main_ex_dispatch_0()
+  attributes {
+    spv.entry_point_abi = {local_size = dense<[32, 1, 1]> : vector<3xi32>}} {
+  %c50 = constant 50 : index
+  %c5 = constant 5 : index
+  %0 = iree.placeholder for "interface buffer"
+         {binding = @legacy_io::@ret0} : memref<10x5xf32>
+  %1 = iree.placeholder for "interface buffer"
+         {binding = @legacy_io::@arg0} : memref<10x5xf32>
+  %2 = iree.placeholder for "interface buffer"
+         {binding = @legacy_io::@arg1} : memref<10x5xf32>
+  %3 = iree.placeholder for "interface buffer"
+         {binding = @legacy_io::@arg2} : memref<10x5xf32>
+  %4 = "gpu.block_id"() {dimension = "x"} : () -> index
+  %5 = "gpu.block_dim"() {dimension = "x"} : () -> index
+  %6 = "gpu.thread_id"() {dimension = "x"} : () -> index
+  %7 = muli %4, %5 : index
+  %8 = addi %7, %6 : index
+  %9 = cmpi "slt", %8, %c50 : index
+  scf.if %9 {
+    %10 = divi_signed %8, %c5 : index
+    %11 = remi_signed %8, %c5 : index
+    %12 = load %1[%10, %11] : memref<10x5xf32>
+    %13 = load %2[%10, %11] : memref<10x5xf32>
+    %14 = load %3[%10, %11] : memref<10x5xf32>
+    %15 = addf %12, %13 : f32
+    %16 = mulf %15, %14 : f32
+    store %16, %0[%10, %11] : memref<10x5xf32>
+  }
+  return
+}
+```
+
+<a name="snippet10"></a> Snippet 10: Distributing the iterations for pointwise
+operations for GPU execution.
+
+### Lowering to SPIR-V dialect
+
+The last step is to take the result of the previous pass and lower it to the
+SPIR-V dialect. Since the SPIR-V dialect is *closed*, i.e. it has a separate type
+system, it's best to lower all the operations to SPIR-V in one step. This is done
+by applying all the patterns that lower all the different IR constructs into
+SPIR-V within the [`ConvertToSPIRVPass`][ConvertToSPIRV]. These are
+
+-   [GPU dialect to SPIR-V conversion][GPUToSPIRV].
+-   [SCF dialect to SPIR-V conversion][SCFToSPIRV].
+-   [Standard dialect to SPIR-V conversion][StandardToSPIRV].
+-   Patterns that lower the `iree.placeholder` instruction into SPIR-V.
+
+Once applied the resulting IR is in SPIR-V dialect that can be serialized to a
+SPIR-V binary.
+
+[ConvertToGPU]: https://github.com/google/iree/blob/main/iree/compiler/Conversion/LinalgToSPIRV/ConvertToGPUPass.cpp
+[ConvertToSPIRV]: https://github.com/google/iree/blob/main/iree/compiler/Conversion/LinalgToSPIRV/ConvertToSPIRVPass.cpp
+[DotAfterAll]: https://gist.github.com/MaheshRavishankar/9e2d406296f469515c4a79bf1e7eef44
+[GPUToSPIRV]: https://github.com/llvm/llvm-project/blob/master/mlir/include/mlir/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.h
+[HLOToLinalgPass]: https://github.com/tensorflow/tensorflow/blob/75c40f6bff2faa3d90a375dfa4025b2e6e2d7a3d/tensorflow/compiler/mlir/xla/transforms/passes.h#L67
+[LinalgDialect]: https://mlir.llvm.org/docs/Dialects/Linalg/
+[LinalgFusionOnBuffers]: https://github.com/llvm/llvm-project/blob/ef868a848e6def288d2df7a1b3ebe09463afc8d0/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h#L86
+[LinalgFusionOfTensorOps]: https://github.com/llvm/llvm-project/blob/80cb25cbd555f9634836b766c86aead435b60eaa/mlir/include/mlir/Dialect/Linalg/Passes.td#L30
+[LinalgPromotionPatterns]: https://github.com/llvm/llvm-project/blob/303a7f7a26e2aae1cb85f49dccbc0b5d14e0b2e0/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h#L358
+[LinalgRationale]: https://mlir.llvm.org/docs/Rationale/RationaleLinalgDialect/
+[LinalgTileAndFuse]: https://github.com/google/iree/blob/main/iree/compiler/Conversion/LinalgToSPIRV/LinalgTileAndFusePass.cpp
+[LinalgTiling]: https://mlir.llvm.org/docs/Dialects/Linalg/#set-of-key-transformationsa-namekey_transformationsa
+[LinalgTilingPatterns]: https://github.com/llvm/llvm-project/blob/master/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+[NVVMAddressSpace]: https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#address-space
+[PwAfterAll]: https://gist.github.com/MaheshRavishankar/02cdd22f7c99e568f933244b5a679510
+[SCFToSPIRV]: https://github.com/llvm/llvm-project/blob/master/mlir/include/mlir/Conversion/SCFToSPIRV/SCFToSPIRV.h
+[SpirvSerialization]: https://mlir.llvm.org/docs/Dialects/SPIR-V/#serialization-and-deserialization
+[StandardToSPIRV]: https://github.com/llvm/llvm-project/blob/master/mlir/include/mlir/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.h
diff --git a/docs/dynamic_shapes.md b/docs/design_docs/dynamic_shapes.md
similarity index 100%
rename from docs/dynamic_shapes.md
rename to docs/design_docs/dynamic_shapes.md
diff --git a/docs/function_abi.md b/docs/design_docs/function_abi.md
similarity index 99%
rename from docs/function_abi.md
rename to docs/design_docs/function_abi.md
index 797f2f6..71d91bb 100644
--- a/docs/function_abi.md
+++ b/docs/design_docs/function_abi.md
@@ -1,4 +1,4 @@
-# Function signatures
+# Function Signatures
 
 A key job of the IREE compiler and runtime is capturing function call semantics
 from the originating system and providing mechanisms so that invocations can be
diff --git a/docs/design_docs/hlo_to_linalg.png b/docs/design_docs/hlo_to_linalg.png
new file mode 100755
index 0000000..469ed26
--- /dev/null
+++ b/docs/design_docs/hlo_to_linalg.png
Binary files differ
diff --git a/docs/design_docs/linalg_to_spirv.png b/docs/design_docs/linalg_to_spirv.png
new file mode 100755
index 0000000..fd6aee7
--- /dev/null
+++ b/docs/design_docs/linalg_to_spirv.png
Binary files differ
diff --git a/docs/simple_ir_walkthrough.md b/docs/design_docs/simple_ir_walkthrough.md
similarity index 99%
rename from docs/simple_ir_walkthrough.md
rename to docs/design_docs/simple_ir_walkthrough.md
index 1dfcb49..68f51a8 100644
--- a/docs/simple_ir_walkthrough.md
+++ b/docs/design_docs/simple_ir_walkthrough.md
@@ -1,5 +1,7 @@
 # Simple IR Walkthrough
 
+Note that this doc is quite outdated. We expect to update it soon.
+
 ## Overview
 
 This walks through the process of lowering TensorFlow python to an IREE module,
diff --git a/docs/roadmap_design.md b/docs/design_roadmap.md
similarity index 100%
rename from docs/roadmap_design.md
rename to docs/design_roadmap.md
diff --git a/docs/benchmarking.md b/docs/developing_iree/benchmarking.md
similarity index 100%
rename from docs/benchmarking.md
rename to docs/developing_iree/benchmarking.md
diff --git a/docs/contributor_tips.md b/docs/developing_iree/contributor_tips.md
similarity index 89%
rename from docs/contributor_tips.md
rename to docs/developing_iree/contributor_tips.md
index 05fb52e..03ddffc 100644
--- a/docs/contributor_tips.md
+++ b/docs/developing_iree/contributor_tips.md
@@ -2,11 +2,11 @@
 
 This is an opinionated guide documenting workflows that some members of the team
 have found useful. It is focused on meta-tooling, not on IREE code specifically
-(you will find the latter in the [Developer Overview](../developer_overview.md))
-It is certainly possible to use workflows other than these, but some common
-tasks, especially for maintainers will likely be made easier if you use these
-flows. It assumes a basic knowledge of `git` and GitHub and suggests some
-specific ways of using it.
+(you will find the latter in the [Developer Overview](developer_overview.md)) It
+is certainly possible to use workflows other than these, but some common tasks,
+especially for maintainers will likely be made easier if you use these flows. It
+assumes a basic knowledge of `git` and GitHub and suggests some specific ways of
+using it.
 
 ## Git Structure
 
diff --git a/docs/developer_overview.md b/docs/developing_iree/developer_overview.md
similarity index 89%
rename from docs/developer_overview.md
rename to docs/developing_iree/developer_overview.md
index 56fb5f8..ec900f5 100644
--- a/docs/developer_overview.md
+++ b/docs/developing_iree/developer_overview.md
@@ -150,7 +150,7 @@
 and executes it as a series of
 [googletest](https://github.com/google/googletest) tests. This is the test
 runner for the IREE
-[check framework](https://github.com/google/iree/tree/main/docs/testing_guide.md#end-to-end-tests).
+[check framework](https://github.com/google/iree/tree/main/docs/developing_iree/testing_guide.md#end-to-end-tests).
 
 ```shell
 $ bazel run iree/tools:iree-translate -- \
@@ -207,6 +207,28 @@
 accept a number where 0, 1, 2, 3 stands for info, warning, error, and fatal
 error respectively.
 
+#### Read inputs from a file
+
+All the IREE tools support reading input values from a file. This is quite
+useful for debugging. Use `-help` for each tool to see which flag to set. The
+inputs are expected to be newline-separated. Each input should be either a
+scalar or a buffer. Scalars should be in the format `type=value` and buffers
+should be in the format `[shape]xtype=[value]`. For example:
+
+```
+1x5xf32=1,-2,-3,4,-5
+1x5x3x1xf32=15,14,13,12,11,10,9,8,7,6,5,4,3,2,1
+```
+
+#### `iree-flow-trace-dispatch-tensors`
+
+This flag will enable tracing inputs and outputs for each dispatch function.
+This makes it easier to narrow down test cases, since IREE breaks an ML
+workload into multiple dispatch functions. When the flag is on, IREE will
+insert trace points before and after each dispatch function. The first trace
+op is for inputs, and the second trace op is for outputs. There will be two
+events for one dispatch function.
+
 ### Useful Vulkan driver flags
 
 For IREE's Vulkan runtime driver, there are a few useful
diff --git a/docs/repository_management.md b/docs/developing_iree/repository_management.md
similarity index 100%
rename from docs/repository_management.md
rename to docs/developing_iree/repository_management.md
diff --git a/docs/testing_guide.md b/docs/developing_iree/testing_guide.md
similarity index 98%
rename from docs/testing_guide.md
rename to docs/developing_iree/testing_guide.md
index 3be690d..5f57c9c 100644
--- a/docs/testing_guide.md
+++ b/docs/developing_iree/testing_guide.md
@@ -30,7 +30,7 @@
 To use the Vulkan backend as test driver, you may need to select between a
 Vulkan implementation from SwiftShader and multiple Vulkan-capable hardware
 devices. This can be done via environment variables. See the
-[generic Vulkan setup](GetStarted/generic_vulkan_env_setup.md#useful-environment-variables)
+[generic Vulkan setup](get_started/generic_vulkan_env_setup.md#useful-environment-variables)
 page for details regarding these variables.
 
 For Bazel, you can persist the configuration in `user.bazelrc` to save typing.
diff --git a/docs/GetStarted/cmake_options_and_variables.md b/docs/get_started/cmake_options_and_variables.md
similarity index 91%
rename from docs/GetStarted/cmake_options_and_variables.md
rename to docs/get_started/cmake_options_and_variables.md
index 8e121fc..3f3dbdd 100644
--- a/docs/GetStarted/cmake_options_and_variables.md
+++ b/docs/get_started/cmake_options_and_variables.md
@@ -63,17 +63,21 @@
 
 #### `IREE_HAL_DRIVERS_TO_BUILD`:STRING
 
-*This does not have any effect at the moment, but will be supported in the
-future!* Semicolon-separated list of HAL drivers to build, or `all` for building
-all HAL drivers. Case-insensitive. Defaults to `all`. Example:
+*Right now this only affects whether tests are enabled when compiling for
+Android; it will be fully supported in the future!*
+
+Semicolon-separated list of HAL drivers to build, or `all` for building all HAL
+drivers. Case-insensitive. Defaults to `all`. Example:
 `-DIREE_HAL_DRIVERS_TO_BUILD="Vulkan;VMLA"`.
 
 #### `IREE_TARGET_BACKENDS_TO_BUILD`:STRING
 
-*This does not have any effect at the moment, but will be supported in the
-future!* Semicolon-separated list of HAL drivers to build, or `all` for building
-all HAL drivers. Case-insensitive. Defaults to `all`. Example:
-`-DIREE_HAL_DRIVERS_TO_BUILD="Vulkan_SPIRV;VMLA"`.
+*Right now this only affects whether tests are enabled when compiling for
+Android; it will be fully supported in the future!*
+
+Semicolon-separated list of compiler target backends to build, or `all` for
+building all compiler target backends. Case-insensitive. Defaults to `all`.
+Example: `-DIREE_TARGET_BACKENDS_TO_BUILD="Vulkan-SPIRV;VMLA"`.
 
 #### `IREE_ENABLE_LLD`:BOOL
 
diff --git a/docs/GetStarted/generic_vulkan_env_setup.md b/docs/get_started/generic_vulkan_env_setup.md
similarity index 100%
rename from docs/GetStarted/generic_vulkan_env_setup.md
rename to docs/get_started/generic_vulkan_env_setup.md
diff --git a/docs/GetStarted/getting_started_android_cmake.md b/docs/get_started/getting_started_android_cmake.md
similarity index 100%
rename from docs/GetStarted/getting_started_android_cmake.md
rename to docs/get_started/getting_started_android_cmake.md
diff --git a/docs/GetStarted/getting_started_linux_bazel.md b/docs/get_started/getting_started_linux_bazel.md
similarity index 97%
rename from docs/GetStarted/getting_started_linux_bazel.md
rename to docs/get_started/getting_started_linux_bazel.md
index 8d4a9dc..7f8b688 100644
--- a/docs/GetStarted/getting_started_linux_bazel.md
+++ b/docs/get_started/getting_started_linux_bazel.md
@@ -123,7 +123,7 @@
 ### Further Reading
 
 *   For an introduction to IREE's project structure and developer tools, see
-    [Developer Overview](../developer_overview.md)
+    [Developer Overview](../developing_iree/developer_overview.md)
 *   To target GPUs using Vulkan, see
     [Getting Started on Linux with Vulkan](getting_started_linux_vulkan.md)
 *   To use IREE's Python bindings, see
diff --git a/docs/GetStarted/getting_started_linux_cmake.md b/docs/get_started/getting_started_linux_cmake.md
similarity index 97%
rename from docs/GetStarted/getting_started_linux_cmake.md
rename to docs/get_started/getting_started_linux_cmake.md
index 70da146..127a80c 100644
--- a/docs/GetStarted/getting_started_linux_cmake.md
+++ b/docs/get_started/getting_started_linux_cmake.md
@@ -110,7 +110,7 @@
 ### Further Reading
 
 *   For an introduction to IREE's project structure and developer tools, see
-    [Developer Overview](../developer_overview.md)
+    [Developer Overview](../developing_iree/developer_overview.md)
 *   To target GPUs using Vulkan, see
     [Getting Started on Linux with Vulkan](getting_started_linux_vulkan.md)
 *   To use IREE's Python bindings, see
diff --git a/docs/GetStarted/getting_started_linux_vulkan.md b/docs/get_started/getting_started_linux_vulkan.md
similarity index 100%
rename from docs/GetStarted/getting_started_linux_vulkan.md
rename to docs/get_started/getting_started_linux_vulkan.md
diff --git a/docs/GetStarted/getting_started_macos_bazel.md b/docs/get_started/getting_started_macos_bazel.md
similarity index 95%
rename from docs/GetStarted/getting_started_macos_bazel.md
rename to docs/get_started/getting_started_macos_bazel.md
index 2b285eb..3fc3dcb 100644
--- a/docs/GetStarted/getting_started_macos_bazel.md
+++ b/docs/get_started/getting_started_macos_bazel.md
@@ -126,8 +126,8 @@
 ### Further Reading
 
 *   For an introduction to IREE's project structure and developer tools, see
-    [Developer Overview](../developer_overview.md) <!-- TODO: Link to macOS
-    versions of these guides once they are developed.
+    [Developer Overview](../developing_iree/developer_overview.md) <!-- TODO:
+    Link to macOS versions of these guides once they are developed.
 *   To target GPUs using Vulkan, see
     [Getting Started on Linux with Vulkan](getting_started_linux_vulkan.md)
 *   To use IREE's Python bindings, see
diff --git a/docs/GetStarted/getting_started_macos_cmake.md b/docs/get_started/getting_started_macos_cmake.md
similarity index 95%
rename from docs/GetStarted/getting_started_macos_cmake.md
rename to docs/get_started/getting_started_macos_cmake.md
index 51ef0ab..7b916cd 100644
--- a/docs/GetStarted/getting_started_macos_cmake.md
+++ b/docs/get_started/getting_started_macos_cmake.md
@@ -110,8 +110,8 @@
 ### Further Reading
 
 *   For an introduction to IREE's project structure and developer tools, see
-    [Developer Overview](../developer_overview.md) <!-- TODO: Link to macOS
-    versions of these guides once they are developed.
+    [Developer Overview](../developing_iree/developer_overview.md) <!-- TODO:
+    Link to macOS versions of these guides once they are developed.
 *   To target GPUs using Vulkan, see
     [Getting Started on Linux with Vulkan](getting_started_linux_vulkan.md)
 *   To use IREE's Python bindings, see
diff --git a/docs/GetStarted/getting_started_python.md b/docs/get_started/getting_started_python.md
similarity index 100%
rename from docs/GetStarted/getting_started_python.md
rename to docs/get_started/getting_started_python.md
diff --git a/docs/GetStarted/getting_started_windows_bazel.md b/docs/get_started/getting_started_windows_bazel.md
similarity index 97%
rename from docs/GetStarted/getting_started_windows_bazel.md
rename to docs/get_started/getting_started_windows_bazel.md
index d3f01c3..8cf0f87 100644
--- a/docs/GetStarted/getting_started_windows_bazel.md
+++ b/docs/get_started/getting_started_windows_bazel.md
@@ -118,7 +118,7 @@
 ### Further Reading
 
 *   For an introduction to IREE's project structure and developer tools, see
-    [Developer Overview](../developer_overview.md)
+    [Developer Overview](../developing_iree/developer_overview.md)
 *   To target GPUs using Vulkan, see
     [Getting Started on Windows with Vulkan](getting_started_windows_vulkan.md)
 *   To use IREE's Python bindings, see
diff --git a/docs/GetStarted/getting_started_windows_cmake.md b/docs/get_started/getting_started_windows_cmake.md
similarity index 97%
rename from docs/GetStarted/getting_started_windows_cmake.md
rename to docs/get_started/getting_started_windows_cmake.md
index dcab418..da5218f 100644
--- a/docs/GetStarted/getting_started_windows_cmake.md
+++ b/docs/get_started/getting_started_windows_cmake.md
@@ -107,7 +107,7 @@
 ### Further Reading
 
 *   For an introduction to IREE's project structure and developer tools, see
-    [Developer Overview](../developer_overview.md)
+    [Developer Overview](../developing_iree/developer_overview.md)
 *   To target GPUs using Vulkan, see
     [Getting Started on Windows with Vulkan](getting_started_windows_vulkan.md)
 *   To use IREE's Python bindings, see
diff --git a/docs/GetStarted/getting_started_windows_vulkan.md b/docs/get_started/getting_started_windows_vulkan.md
similarity index 100%
rename from docs/GetStarted/getting_started_windows_vulkan.md
rename to docs/get_started/getting_started_windows_vulkan.md
diff --git a/docs/IREE-Architecture.svg b/docs/iree_architecture.svg
similarity index 100%
rename from docs/IREE-Architecture.svg
rename to docs/iree_architecture.svg
diff --git a/docs/roadmap.md b/docs/milestones.md
similarity index 94%
rename from docs/roadmap.md
rename to docs/milestones.md
index b979de7..322fa7e 100644
--- a/docs/roadmap.md
+++ b/docs/milestones.md
@@ -1,11 +1,11 @@
-# IREE Roadmap
+# IREE Milestones
 
 ## Design
 
 Though many of the core dialects are now in place enough for correctness testing
 a large majority of the features we are most excited to demonstrate are still
 TODO and will be coming over the next few quarters. You can find a highlighted
-set of coming features in the [design roadmap](roadmap_design.md).
+set of coming features in the [design roadmap](design_roadmap.md).
 
 ## Spring/Summer 2020 Focus Areas
 
@@ -37,7 +37,7 @@
 ### HAL: Marl CPU Scheduling
 
 We want to plug in [marl](https://github.com/google/marl) to provide
-[CPU-side work scheduling](roadmap_design.md#gpu-like-cpu-scheduling) that
+[CPU-side work scheduling](design_roadmap.md#gpu-like-cpu-scheduling) that
 matches GPU semantics. This will enable improved CPU utilization and allow us to
 verify the approach with benchmarks.
 
diff --git a/docs/mnist_example.md b/docs/mnist_example.md
deleted file mode 100644
index 93cd23a..0000000
--- a/docs/mnist_example.md
+++ /dev/null
@@ -1,254 +0,0 @@
-# MNIST IR Example
-
-This shows the MNIST MLP model as it is compiled from Keras, lowered to XLA HLO,
-and then lowered to an IREE module with SPIR-V. Several steps are omitted for
-brevity.
-
-## TensorFlow Keras Model
-
-```python
-def simple_mnist_model(input_shape):
-  """Creates a simple (multi-layer perceptron) MNIST model."""
-  model = tf.keras.models.Sequential()
-  # Flatten to a 1d array (e.g. 28x28 -> 784)
-  model.add(tf.keras.layers.Flatten(input_shape=input_shape))
-  # Fully-connected neural layer with 128 neurons, RELU activation
-  model.add(tf.keras.layers.Dense(128, activation='relu'))
-  # Fully-connected neural layer returning probability scores for each class
-  model.add(tf.keras.layers.Dense(10, activation='softmax'))
-  return model
-```
-
-## XLA HLO
-
-**NOTE**: this uses placeholder weights to keep the page from being a few
-thousand lines of floats.
-
-```mlir
-module {
-  func @main(%arg0: tensor<1x28x28x1xf32>) -> tuple<tensor<1x10xf32>>
-  attributes {iree.module.export} {
-    %cst = constant  {name = "constant.9"} dense<0.5> : tensor<f32>
-    %0 = "mhlo.broadcast_in_dim"(%cst) {name = "broadcast.10"} : (tensor<f32>) -> tensor<1x128xf32>
-    %1 = "mhlo.copy"(%arg0) {name = "copy.1"} : (tensor<1x28x28x1xf32>) -> tensor<1x28x28x1xf32>
-    %2 = "mhlo.reshape"(%1) {name = "reshape.2"} : (tensor<1x28x28x1xf32>) -> tensor<1x28x28x1xf32>
-    %3 = "mhlo.reshape"(%2) {name = "reshape.3"} : (tensor<1x28x28x1xf32>) -> tensor<1x784xf32>
-    %cst_0 = constant  {name = "constant.4"} dense<0.5> : tensor<784x128xf32>
-    %4 = "mhlo.dot"(%3, %cst_0) {name = "dot.5", precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x784xf32>, tensor<784x128xf32>) -> tensor<1x128xf32>
-    %cst_1 = constant  {name = "constant.6"} dense<0.5> : tensor<128xf32>
-    %5 = "mhlo.broadcast_in_dim"(%cst_1) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "broadcast.7"} : (tensor<128xf32>) -> tensor<1x128xf32>
-    %6 = "mhlo.add"(%4, %5) {name = "add.8"} : (tensor<1x128xf32>, tensor<1x128xf32>) -> tensor<1x128xf32>
-    %7 = "mhlo.maximum"(%0, %6) {name = "maximum.11"} : (tensor<1x128xf32>, tensor<1x128xf32>) -> tensor<1x128xf32>
-    %cst_2 = constant  {name = "constant.12"} dense<0.5> : tensor<128x10xf32>
-    %8 = "mhlo.dot"(%7, %cst_2) {name = "dot.13", precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x128xf32>, tensor<128x10xf32>) -> tensor<1x10xf32>
-    %cst_3 = constant  {name = "constant.14"} dense<0.5> : tensor<10xf32>
-    %9 = "mhlo.broadcast_in_dim"(%cst_3) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "broadcast.15"} : (tensor<10xf32>) -> tensor<1x10xf32>
-    %10 = "mhlo.add"(%8, %9) {name = "add.16"} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
-    %cst_4 = constant  {name = "constant.17"} dense<0xFF800000> : tensor<f32>
-    %11 = "mhlo.reduce"(%10, %cst_4) ( {
-    ^bb0(%arg1: tensor<f32>, %arg2: tensor<f32>):   // no predecessors
-      %20 = "mhlo.maximum"(%arg1, %arg2) {name = "maximum.21"} : (tensor<f32>, tensor<f32>) -> tensor<f32>
-      "mhlo.return"(%20) : (tensor<f32>) -> ()
-    }) {dimensions = dense<1> : tensor<1xi64>} : (tensor<1x10xf32>, tensor<f32>) -> tensor<1xf32>
-    %12 = "mhlo.broadcast_in_dim"(%11) {broadcast_dimensions = dense<0> : tensor<1xi64>, name = "broadcast.23"} : (tensor<1xf32>) -> tensor<1x10xf32>
-    %13 = "mhlo.subtract"(%10, %12) {name = "subtract.24"} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
-    %14 = "mhlo.exponential"(%13) {name = "exponential.25"} : (tensor<1x10xf32>) -> tensor<1x10xf32>
-    %cst_5 = constant  {name = "constant.27"} dense<0.5> : tensor<f32>
-    %15 = "mhlo.reduce"(%14, %cst_5) ( {
-    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>):   // no predecessors
-      %21 = "mhlo.add"(%arg3, %arg4) {name = "add.31"} : (tensor<f32>, tensor<f32>) -> tensor<f32>
-      "mhlo.return"(%21) : (tensor<f32>) -> ()
-    }) {dimensions = dense<1> : tensor<1xi64>} : (tensor<1x10xf32>, tensor<f32>) -> tensor<1xf32>
-    %16 = "mhlo.broadcast_in_dim"(%15) {broadcast_dimensions = dense<0> : tensor<1xi64>, name = "broadcast.34"} : (tensor<1xf32>) -> tensor<1x10xf32>
-    %17 = "mhlo.divide"(%14, %16) {name = "divide.35"} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
-    %18 = "mhlo.reshape"(%17) {name = "reshape.36"} : (tensor<1x10xf32>) -> tensor<1x10xf32>
-    %19 = "mhlo.tuple"(%18) {name = "tuple.37"} : (tensor<1x10xf32>) -> tuple<tensor<1x10xf32>>
-    return %19 : tuple<tensor<1x10xf32>>
-  }
-}
-```
-
-## IREE IR (pre-backend lowering)
-
-Here's the lowered, outlined, and compiler-annotated version of the above in the
-IREE sequencer dialect.
-
-```mlir
-module {
-  iree.multi_arch_executable @main_ex_dispatch_0[0]() {
-    iree.executable[0](Unspecified) {
-      module {
-        func @main_entry_dispatch_0(%arg0: memref<1x28x28x1xf32>, %arg1: memref<1x784xf32>)
-  attributes  {iree.executable.export, iree.executable.workload = dense<[784, 1, 1]> : tensor<3xi32>, iree.ordinal = 0 : i32} {
-          %0 = iree.load_input(%arg0 : memref<1x28x28x1xf32>) : tensor<1x28x28x1xf32>
-          %1 = "mhlo.copy"(%0) {name = "copy.1"} : (tensor<1x28x28x1xf32>) -> tensor<1x28x28x1xf32>
-          %2 = "mhlo.reshape"(%1) {name = "reshape.3"} : (tensor<1x28x28x1xf32>) -> tensor<1x784xf32>
-          iree.store_output(%2 : tensor<1x784xf32>, %arg1 : memref<1x784xf32>)
-          iree.return
-        }
-      }
-    }
-  }
-  iree.multi_arch_executable @main_ex_dispatch_1[1]() {
-    iree.executable[1](Unspecified) {
-      module {
-        func @main_entry_dispatch_1(%arg0: memref<1x784xf32>, %arg1: memref<784x128xf32>, %arg2: memref<1x128xf32>)
-  attributes  {iree.executable.export, iree.executable.workload = dense<[128, 1, 1]> : tensor<3xi32>, iree.ordinal = 0 : i32} {
-          %0 = iree.load_input(%arg0 : memref<1x784xf32>) : tensor<1x784xf32>
-          %1 = iree.load_input(%arg1 : memref<784x128xf32>) : tensor<784x128xf32>
-          %2 = "mhlo.dot"(%0, %1) {name = "dot.5", precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x784xf32>, tensor<784x128xf32>) -> tensor<1x128xf32>
-          iree.store_output(%2 : tensor<1x128xf32>, %arg2 : memref<1x128xf32>)
-          iree.return
-        }
-      }
-    }
-  }
-  iree.multi_arch_executable @main_ex_dispatch_2[2]() {
-    iree.executable[2](Unspecified) {
-      module {
-        func @main_entry_dispatch_2(%arg0: memref<1x128xf32>, %arg1: memref<1x128xf32>)
-  attributes  {iree.executable.export, iree.executable.workload = dense<[128, 1, 1]> : tensor<3xi32>, iree.ordinal = 0 : i32} {
-          %0 = iree.load_input(%arg0 : memref<1x128xf32>) : tensor<1x128xf32>
-          %cst = constant dense<5.000000e-01> : tensor<128xf32>
-          %cst_0 = constant dense<5.000000e-01> : tensor<f32>
-          %1 = "mhlo.broadcast_in_dim"(%cst_0) {name = "broadcast.10"} : (tensor<f32>) -> tensor<1x128xf32>
-          %2 = "mhlo.broadcast_in_dim"(%cst) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "broadcast.7"} : (tensor<128xf32>) -> tensor<1x128xf32>
-          %3 = addf %0, %2 : tensor<1x128xf32>
-          %4 = mhlo.maximum %1, %3 {name = "maximum.11"} : tensor<1x128xf32>
-          iree.store_output(%4 : tensor<1x128xf32>, %arg1 : memref<1x128xf32>)
-          iree.return
-        }
-      }
-    }
-  }
-  iree.multi_arch_executable @main_ex_dispatch_3[3]() {
-    iree.executable[3](Unspecified) {
-      module {
-        func @main_entry_dispatch_3(%arg0: memref<1x128xf32>, %arg1: memref<128x10xf32>, %arg2: memref<1x10xf32>)
-  attributes  {iree.executable.export, iree.executable.workload = dense<[10, 1, 1]> : tensor<3xi32>, iree.ordinal = 0 : i32} {
-          %0 = iree.load_input(%arg0 : memref<1x128xf32>) : tensor<1x128xf32>
-          %1 = iree.load_input(%arg1 : memref<128x10xf32>) : tensor<128x10xf32>
-          %2 = "mhlo.dot"(%0, %1) {name = "dot.13", precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x128xf32>, tensor<128x10xf32>) -> tensor<1x10xf32>
-          iree.store_output(%2 : tensor<1x10xf32>, %arg2 : memref<1x10xf32>)
-          iree.return
-        }
-      }
-    }
-  }
-  iree.multi_arch_executable @main_ex_dispatch_4[4]() {
-    iree.executable[4](Unspecified) {
-      module {
-        func @main_entry_dispatch_4(%arg0: memref<1x10xf32>, %arg1: memref<1x10xf32>)
-  attributes  {iree.executable.export, iree.executable.workload = dense<[10, 1, 1]> : tensor<3xi32>, iree.ordinal = 0 : i32} {
-          %0 = iree.load_input(%arg0 : memref<1x10xf32>) : tensor<1x10xf32>
-          %cst = constant dense<5.000000e-01> : tensor<10xf32>
-          %1 = "mhlo.broadcast_in_dim"(%cst) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "broadcast.15"} : (tensor<10xf32>) -> tensor<1x10xf32>
-          %2 = addf %0, %1 : tensor<1x10xf32>
-          iree.store_output(%2 : tensor<1x10xf32>, %arg1 : memref<1x10xf32>)
-          iree.return
-        }
-      }
-    }
-  }
-  iree.multi_arch_executable @main_ex_dispatch_5[5]() {
-    iree.executable[5](Unspecified) {
-      module {
-        func @main_entry_dispatch_5(%arg0: memref<1x10xf32>, %arg1: memref<1xf32>)
-  attributes  {iree.executable.export, iree.executable.workload = dense<1> : tensor<3xi32>, iree.ordinal = 0 : i32} {
-          %0 = iree.load_input(%arg0 : memref<1x10xf32>) : tensor<1x10xf32>
-          %cst = constant dense<0xFF800000> : tensor<f32>
-          %1 = "mhlo.reduce"(%0, %cst) ( {
-          ^bb0(%arg2: tensor<f32>, %arg3: tensor<f32>): // no predecessors
-            %2 = mhlo.maximum %arg2, %arg3 {name = "maximum.21"} : tensor<f32>
-            "mhlo.return"(%2) : (tensor<f32>) -> ()
-          }) {dimensions = dense<1> : tensor<1xi64>} : (tensor<1x10xf32>, tensor<f32>) -> tensor<1xf32>
-          iree.store_output(%1 : tensor<1xf32>, %arg1 : memref<1xf32>)
-          iree.return
-        }
-      }
-    }
-  }
-  iree.multi_arch_executable @main_ex_dispatch_6[6]() {
-    iree.executable[6](Unspecified) {
-      module {
-        func @main_entry_dispatch_6(%arg0: memref<1x10xf32>, %arg1: memref<1xf32>, %arg2: memref<1x10xf32>)
-  attributes  {iree.executable.export, iree.executable.workload = dense<[10, 1, 1]> : tensor<3xi32>, iree.ordinal = 0 : i32} {
-          %0 = iree.load_input(%arg0 : memref<1x10xf32>) : tensor<1x10xf32>
-          %1 = iree.load_input(%arg1 : memref<1xf32>) : tensor<1xf32>
-          %2 = "mhlo.broadcast_in_dim"(%1) {broadcast_dimensions = dense<0> : tensor<1xi64>, name = "broadcast.23"} : (tensor<1xf32>) -> tensor<1x10xf32>
-          %3 = subf %0, %2 : tensor<1x10xf32>
-          %4 = "mhlo.exponential"(%3) {name = "exponential.25"} : (tensor<1x10xf32>) -> tensor<1x10xf32>
-          iree.store_output(%4 : tensor<1x10xf32>, %arg2 : memref<1x10xf32>)
-          iree.return
-        }
-      }
-    }
-  }
-  iree.multi_arch_executable @main_ex_dispatch_7[7]() {
-    iree.executable[7](Unspecified) {
-      module {
-        func @main_entry_dispatch_7(%arg0: memref<1x10xf32>, %arg1: memref<1xf32>)
-  attributes  {iree.executable.export, iree.executable.workload = dense<1> : tensor<3xi32>, iree.ordinal = 0 : i32} {
-          %0 = iree.load_input(%arg0 : memref<1x10xf32>) : tensor<1x10xf32>
-          %cst = constant dense<5.000000e-01> : tensor<f32>
-          %1 = "mhlo.reduce"(%0, %cst) ( {
-          ^bb0(%arg2: tensor<f32>, %arg3: tensor<f32>): // no predecessors
-            %2 = addf %arg2, %arg3 : tensor<f32>
-            "mhlo.return"(%2) : (tensor<f32>) -> ()
-          }) {dimensions = dense<1> : tensor<1xi64>} : (tensor<1x10xf32>, tensor<f32>) -> tensor<1xf32>
-          iree.store_output(%1 : tensor<1xf32>, %arg1 : memref<1xf32>)
-          iree.return
-        }
-      }
-    }
-  }
-  iree.multi_arch_executable @main_ex_dispatch_8[8]() {
-    iree.executable[8](Unspecified) {
-      module {
-        func @main_entry_dispatch_8(%arg0: memref<1xf32>, %arg1: memref<1x10xf32>, %arg2: memref<1x10xf32>)
-  attributes  {iree.executable.export, iree.executable.workload = dense<[10, 1, 1]> : tensor<3xi32>, iree.ordinal = 0 : i32} {
-          %0 = iree.load_input(%arg0 : memref<1xf32>) : tensor<1xf32>
-          %1 = iree.load_input(%arg1 : memref<1x10xf32>) : tensor<1x10xf32>
-          %2 = "mhlo.broadcast_in_dim"(%0) {broadcast_dimensions = dense<0> : tensor<1xi64>, name = "broadcast.34"} : (tensor<1xf32>) -> tensor<1x10xf32>
-          %3 = divf %1, %2 : tensor<1x10xf32>
-          iree.store_output(%3 : tensor<1x10xf32>, %arg2 : memref<1x10xf32>)
-          iree.return
-        }
-      }
-    }
-  }
-  func @main(%arg0: memref<1x28x28x1xf32>) -> memref<1x10xf32>
-  attributes  {iree.module.export} {
-    %0 = "iree_ll_seq.constant"() {value = dense<5.000000e-01> : tensor<784x128xf32>} : () -> memref<784x128xf32>
-    %1 = "iree_ll_seq.constant"() {value = dense<5.000000e-01> : tensor<128x10xf32>} : () -> memref<128x10xf32>
-    %2 = "iree_ll_seq.alloc_heap"() : () -> memref<1x784xf32>
-    iree_ll_seq.static_dispatch main_ex_dispatch_0::main_entry_dispatch_0[dense<[784, 1, 1]> : tensor<3xi32>](%arg0, %2) : (memref<1x28x28x1xf32>, memref<1x784xf32>) -> ()
-    %3 = "iree_ll_seq.alloc_heap"() : () -> memref<1x128xf32>
-    iree_ll_seq.static_dispatch main_ex_dispatch_1::main_entry_dispatch_1[dense<[128, 1, 1]> : tensor<3xi32>](%2, %0, %3) : (memref<1x784xf32>, memref<784x128xf32>, memref<1x128xf32>) -> ()
-    %4 = "iree_ll_seq.alloc_heap"() : () -> memref<1x128xf32>
-    iree_ll_seq.static_dispatch main_ex_dispatch_2::main_entry_dispatch_2[dense<[128, 1, 1]> : tensor<3xi32>](%3, %4) : (memref<1x128xf32>, memref<1x128xf32>) -> ()
-    %5 = "iree_ll_seq.alloc_heap"() : () -> memref<1x10xf32>
-    iree_ll_seq.static_dispatch main_ex_dispatch_3::main_entry_dispatch_3[dense<[10, 1, 1]> : tensor<3xi32>](%4, %1, %5) : (memref<1x128xf32>, memref<128x10xf32>, memref<1x10xf32>) -> ()
-    %6 = "iree_ll_seq.alloc_heap"() : () -> memref<1x10xf32>
-    iree_ll_seq.static_dispatch main_ex_dispatch_4::main_entry_dispatch_4[dense<[10, 1, 1]> : tensor<3xi32>](%5, %6) : (memref<1x10xf32>, memref<1x10xf32>) -> ()
-    %7 = "iree_ll_seq.alloc_heap"() : () -> memref<1xf32>
-    iree_ll_seq.static_dispatch main_ex_dispatch_5::main_entry_dispatch_5[dense<1> : tensor<3xi32>](%6, %7) : (memref<1x10xf32>, memref<1xf32>) -> ()
-    %8 = "iree_ll_seq.alloc_heap"() : () -> memref<1x10xf32>
-    iree_ll_seq.static_dispatch main_ex_dispatch_6::main_entry_dispatch_6[dense<[10, 1, 1]> : tensor<3xi32>](%6, %7, %8) : (memref<1x10xf32>, memref<1xf32>, memref<1x10xf32>) -> ()
-    %9 = "iree_ll_seq.alloc_heap"() : () -> memref<1xf32>
-    iree_ll_seq.static_dispatch main_ex_dispatch_7::main_entry_dispatch_7[dense<1> : tensor<3xi32>](%8, %9) : (memref<1x10xf32>, memref<1xf32>) -> ()
-    %10 = "iree_ll_seq.alloc_heap"() : () -> memref<1x10xf32>
-    iree_ll_seq.static_dispatch main_ex_dispatch_8::main_entry_dispatch_8[dense<[10, 1, 1]> : tensor<3xi32>](%9, %8, %10) : (memref<1xf32>, memref<1x10xf32>, memref<1x10xf32>) -> ()
-    iree_ll_seq.return %10 : memref<1x10xf32>
-  }
-}
-```
-
-**NOTE**: this is effectively compiling in -O0, which is why the buffers are not
-aliased and some dispatch region fusing is not performed. As we get things going
-we'll be adding simple optimizations that can operate on this IR to elide almost
-all copies and externalize allocations to transient pooled memory.
-
-## Final IREE Module with SPIR-V
-
-TODO(benvanik): once reductions are done.
diff --git a/docs/using_colab.md b/docs/using_iree/using_colab.md
similarity index 100%
rename from docs/using_colab.md
rename to docs/using_iree/using_colab.md
diff --git a/experimental/ModelBuilder/MemRefUtils.h b/experimental/ModelBuilder/MemRefUtils.h
index ebd6d1b..cb03bde 100644
--- a/experimental/ModelBuilder/MemRefUtils.h
+++ b/experimental/ModelBuilder/MemRefUtils.h
@@ -44,8 +44,8 @@
 #include <functional>
 #include <memory>
 
-#include "llvm/ADT/Optional.h"
-#include "mlir/ExecutionEngine/CRunnerUtils.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/Optional.h"
+#include "third_party/llvm/llvm-project/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h"
 
 #ifndef IREE_EXPERIMENTAL_MODELBUILDER_MEMREFUTILS_H_
 #define IREE_EXPERIMENTAL_MODELBUILDER_MEMREFUTILS_H_
@@ -74,20 +74,53 @@
 
 // Mallocs a StridedMemRefDescriptor<T, N>* that matches the MLIR ABI.
 // This is an implementation detail that is kept in sync with MLIR codegen
+// conventions.  Additionally takes a `shapeAlloc` array which
+// is used instead of `shape` to allocate "more aligned" data and compute the
+// corresponding strides.
+template <typename T, int N>
+typename std::enable_if<(N >= 1), StridedMemRefType<T, N> *>::type
+makeStridedMemRefDescriptor(void *ptr, void *alignedPtr,
+                            const std::array<int64_t, N> &shape,
+                            const std::array<int64_t, N> &shapeAlloc,
+                            AllocFunType allocFun = &::malloc) {
+  StridedMemRefType<T, N> *descriptor = static_cast<StridedMemRefType<T, N> *>(
+      allocFun(sizeof(StridedMemRefType<T, N>)));
+  descriptor->basePtr = static_cast<T *>(ptr);
+  descriptor->data = static_cast<T *>(alignedPtr);
+  descriptor->offset = 0;
+  std::copy(shape.begin(), shape.end(), descriptor->sizes);
+  auto strides = makeStrides<N>(shapeAlloc);
+  std::copy(strides.begin(), strides.end(), descriptor->strides);
+  return descriptor;
+}
+
+// Mallocs a StridedMemRefDescriptor<T, N>* that matches the MLIR ABI.
+// This is an implementation detail that is kept in sync with MLIR codegen
 // conventions.
 template <typename T, int N>
 typename std::enable_if<(N >= 1), StridedMemRefType<T, N> *>::type
 makeStridedMemRefDescriptor(void *ptr, void *alignedPtr,
                             const std::array<int64_t, N> &shape,
-                            AllocFunType alloc = &::malloc) {
-  StridedMemRefType<T, N> *descriptor = static_cast<StridedMemRefType<T, N> *>(
-      alloc(sizeof(StridedMemRefType<T, N>)));
+                            AllocFunType allocFun = &::malloc) {
+  return makeStridedMemRefDescriptor(ptr, alignedPtr, shape, shape, allocFun);
+}
+
+// Mallocs a StridedMemRefDescriptor<T, 0>* that matches the MLIR ABI.
+// This is an implementation detail that is kept in sync with MLIR codegen
+// conventions.  Additionally takes a `shapeAlloc` array which
+// is used instead of `shape` to allocate "more aligned" data and compute the
+// corresponding strides.
+template <typename T, int N>
+typename std::enable_if<(N == 0), StridedMemRefType<T, 0> *>::type
+makeStridedMemRefDescriptor(void *ptr, void *alignedPtr,
+                            const std::array<int64_t, N> &shape = {},
+                            const std::array<int64_t, N> &shapeAlloc = {},
+                            AllocFunType allocFun = &::malloc) {
+  StridedMemRefType<T, 0> *descriptor = static_cast<StridedMemRefType<T, 0> *>(
+      allocFun(sizeof(StridedMemRefType<T, 0>)));
   descriptor->basePtr = static_cast<T *>(ptr);
   descriptor->data = static_cast<T *>(alignedPtr);
   descriptor->offset = 0;
-  std::copy(shape.begin(), shape.end(), descriptor->sizes);
-  auto strides = makeStrides<N>(shape);
-  std::copy(strides.begin(), strides.end(), descriptor->strides);
   return descriptor;
 }
 
@@ -98,13 +131,8 @@
 typename std::enable_if<(N == 0), StridedMemRefType<T, 0> *>::type
 makeStridedMemRefDescriptor(void *ptr, void *alignedPtr,
                             const std::array<int64_t, N> &shape = {},
-                            AllocFunType alloc = &::malloc) {
-  StridedMemRefType<T, 0> *descriptor = static_cast<StridedMemRefType<T, 0> *>(
-      alloc(sizeof(StridedMemRefType<T, 0>)));
-  descriptor->basePtr = static_cast<T *>(ptr);
-  descriptor->data = static_cast<T *>(alignedPtr);
-  descriptor->offset = 0;
-  return descriptor;
+                            AllocFunType allocFun = &::malloc) {
+  return makeStridedMemRefDescriptor(ptr, alignedPtr, shape, shape, allocFun);
 }
 
 // Mallocs an UnrankedMemRefType<T>* that contains a ranked
@@ -113,9 +141,9 @@
 template <typename T, int N>
 ::UnrankedMemRefType<T> *allocUnrankedDescriptor(
     void *data, void *alignedData, const std::array<int64_t, N> &shape,
-    AllocFunType alloc = &::malloc) {
+    AllocFunType allocFun = &::malloc) {
   ::UnrankedMemRefType<T> *res = static_cast<::UnrankedMemRefType<T> *>(
-      alloc(sizeof(::UnrankedMemRefType<T>)));
+      allocFun(sizeof(::UnrankedMemRefType<T>)));
   res->rank = N;
   res->descriptor = makeStridedMemRefDescriptor<T, N>(data, alignedData, shape);
   return res;
@@ -157,14 +185,14 @@
 // and greater than the size of T. By default the alignment is sizeof(T).
 template <typename T>
 std::pair<void *, void *> allocAligned(
-    size_t nElements, AllocFunType alloc = &::malloc,
+    size_t nElements, AllocFunType allocFun = &::malloc,
     llvm::Optional<uint64_t> alignment = llvm::Optional<uint64_t>()) {
   assert(sizeof(T) < (1ul << 32) && "Elemental type overflows");
   auto size = nElements * sizeof(T);
   auto desiredAlignment = alignment.getValueOr(pow2msb(sizeof(T)));
   assert((desiredAlignment & (desiredAlignment - 1)) == 0);
   assert(desiredAlignment >= sizeof(T));
-  void *data = alloc(size + desiredAlignment);
+  void *data = allocFun(size + desiredAlignment);
   uintptr_t addr = reinterpret_cast<uintptr_t>(data);
   uintptr_t rem = addr % desiredAlignment;
   void *alignedData =
@@ -175,43 +203,48 @@
 }
 
 // Entry point to allocate a dense buffer with a given `shape` and initializer
-// of type PointwiseInitializer. Can optionally take specific `alloc` and `free`
-// functions.
+// of type PointwiseInitializer. Additionally takes a `shapeAlloc` array which
+// is used instead of `shape` to allocate "more aligned" data and compute the
+// corresponding strides.
+// Can optionally take specific alloc and free functions.
+//
+// Example:
+// When called with `shape = [128, 127]` and `shapeAlloc = [128, 128]`, this
+// allocates a memref with `128*128*sizeof(T)` bytes, `sizes = [128, 127]` and
+// `strides = [128, 1]`.
 template <typename T, int N, typename FreeFunType = decltype(&::free)>
-std::unique_ptr<::UnrankedMemRefType<float>, FreeFunType>
-makeInitializedUnrankedDescriptor(
-    const std::array<int64_t, N> &shape, LinearInitializer<T> init,
+std::unique_ptr<StridedMemRefType<T, N>, FreeFunType>
+makeInitializedStridedMemRefDescriptor(
+    const std::array<int64_t, N> &shape,
+    const std::array<int64_t, N> &shapeAlloc, LinearInitializer<T> init,
     llvm::Optional<uint64_t> alignment = llvm::Optional<uint64_t>(),
-    AllocFunType alloc = &::malloc, FreeFunType freeFun = &::free) {
+    AllocFunType allocFun = &::malloc, FreeFunType freeFun = &::free) {
+  for (unsigned i = 0; i < N; ++i)
+    assert(shape[i] <= shapeAlloc[i] &&
+           "shapeAlloc must be greater than or equal to shape");
   int64_t nElements = 1;
-  for (int64_t s : shape) nElements *= s;
-  auto allocated = allocAligned<T>(nElements, alloc, alignment);
+  for (int64_t s : shapeAlloc) nElements *= s;
+  auto allocated = allocAligned<T>(nElements, allocFun, alignment);
   auto *data = static_cast<T *>(allocated.first);
   auto *alignedData = static_cast<T *>(allocated.second);
   for (unsigned i = 0; i < nElements; ++i) init(i, alignedData);
-  return std::unique_ptr<::UnrankedMemRefType<float>, FreeFunType>(
-      detail::allocUnrankedDescriptor<T, N>(data, alignedData, shape), freeFun);
+  return std::unique_ptr<StridedMemRefType<T, N>, FreeFunType>(
+      detail::makeStridedMemRefDescriptor<T, N>(data, alignedData, shape,
+                                                shapeAlloc, allocFun),
+      freeFun);
 }
 
 // Entry point to allocate a dense buffer with a given `shape` and initializer
-// of type PointwiseInitializer. Can optionally take specific `alloc` and `free`
+// of type PointwiseInitializer. Can optionally take specific alloc and free
 // functions.
 template <typename T, int N, typename FreeFunType = decltype(&::free)>
 std::unique_ptr<StridedMemRefType<T, N>, FreeFunType>
 makeInitializedStridedMemRefDescriptor(
     const std::array<int64_t, N> &shape, LinearInitializer<T> init,
     llvm::Optional<uint64_t> alignment = llvm::Optional<uint64_t>(),
-    AllocFunType alloc = &::malloc, FreeFunType freeFun = &::free) {
-  int64_t nElements = 1;
-  for (int64_t s : shape) nElements *= s;
-  auto allocated = allocAligned<T>(nElements, alloc, alignment);
-  auto *data = static_cast<T *>(allocated.first);
-  auto *alignedData = static_cast<T *>(allocated.second);
-  for (unsigned i = 0; i < nElements; ++i) init(i, alignedData);
-  return std::unique_ptr<StridedMemRefType<T, N>, FreeFunType>(
-      detail::makeStridedMemRefDescriptor<T, N>(data, alignedData, shape,
-                                                alloc),
-      freeFun);
+    AllocFunType allocFun = &::malloc, FreeFunType freeFun = &::free) {
+  return makeInitializedStridedMemRefDescriptor<T, N>(
+      shape, shape, init, alignment, allocFun, freeFun);
 }
 
 }  // namespace mlir
diff --git a/experimental/ModelBuilder/ModelRunner.h b/experimental/ModelBuilder/ModelRunner.h
index 3539dbe..8afb37a 100644
--- a/experimental/ModelBuilder/ModelRunner.h
+++ b/experimental/ModelBuilder/ModelRunner.h
@@ -116,7 +116,10 @@
   }
   // Direct invocation based on MemRefType which automatically packs the data.
   template <typename... Args>
-  llvm::Error invoke(StringRef funcName, Args &... args) {
+  // TODO(suderman): Re-enable clang-format when new version migrates.
+  // clang-format off
+  llvm::Error invoke(StringRef funcName, Args &...args) {
+    // clang-format on
     const std::string adapterName =
         std::string("_mlir_ciface_") + funcName.str();
     void *argsArray[] = {getData(args)...};
diff --git a/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_test_utils.py b/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_test_utils.py
index 9cc1c93..f2c6151 100644
--- a/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_test_utils.py
+++ b/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_test_utils.py
@@ -264,8 +264,7 @@
       return self.multi()
 
   reinitialized_modules = [
-      tf_utils.CompiledModule.from_existing(module)
-      for module in compiled_backends.values()
+      module.create_reinitialized() for module in compiled_backends.values()
   ]
   return VirtualBackendsClass(*reinitialized_modules)
 
@@ -366,10 +365,12 @@
     try:
       backends = get_backends()
       cls._compiled_backends_dict = {}
-      for backend in backends:
-        compiled_backend = tf_utils.CompiledModule.compile(
-            cls._module_class, backend, cls._exported_names, global_debug_dir)
-        cls._compiled_backends_dict[backend.name] = compiled_backend
+      for backend_info in backends:
+        compiled_backend = backend_info.CompiledModule(cls._module_class,
+                                                       backend_info,
+                                                       cls._exported_names,
+                                                       global_debug_dir)
+        cls._compiled_backends_dict[backend_info.name] = compiled_backend
     finally:
       # Disable crash reproducer (to avoid inadvertently overwriting this
       # path on a subsequent interaction).
diff --git a/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_utils.py b/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_utils.py
index 4ef66d1..46a3785 100644
--- a/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_utils.py
+++ b/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_utils.py
@@ -39,22 +39,32 @@
   np.random.seed(seed)
 
 
+def backends_to_str(target_backends):
+  """Creates a flattened and normalized string representing target_backends."""
+  normalized_backends = []
+  for backend in target_backends:
+    # Remove unusual characters and ensure names don't end or start in "_".
+    backend = re.sub("[^0-9a-zA-Z_]+", "_", backend)
+    normalized_backends.append(backend.strip("_"))
+  return "__".join(normalized_backends)
+
+
 def compile_tf_module(tf_module,
                       target_backends=(),
                       exported_names=(),
                       artifacts_dir=None):
   """Compiles a TensorFlow tf.Module and optionally saves compilation artifacts.
 
-  The artifact this creates is not callable. See IreeCompiledModule.compile(...)
-  for an API that returns a module that can be called without any further steps.
+  The artifact this creates is not callable. See IreeCompiledModule for an API
+  that returns a module that can be called without any further steps.
 
   If artifacts_dir is provided then the following artifacts will be saved:
     saved_model:
       A TF SavedModel directory containing the files used translate the
       tf.Module into an IREE module.
-    tf_input__backends.mlir:
+    tf_input.mlir:
       MLIR for the module in TF's input dialect.
-    iree_input__backends.mlir:
+    iree_input.mlir:
       The MLIR above translated to IREE via compiler.TF_IMPORT_PASS_PIPELINE.
     compiled__backends.vmfb:
       A VM FlatBuffer compiled to the target backends from the IREE MLIR above.
@@ -77,14 +87,6 @@
     # We break up the compilation here so we can save intermediary artifacts.
     compiler_context = compiler.Context()
 
-    if artifacts_dir is not None:
-      normalized_backends = []
-      for backend in target_backends:
-        # Remove unusual characters and ensure names don't end or start in "_".
-        backend = re.sub("[^0-9a-zA-Z_]+", "_", backend)
-        normalized_backends.append(backend.strip("_"))
-      backends_string = "__".join(normalized_backends)
-
     # Convert the tf_module into raw TF input MLIR.
     compiler_module = compiler.tf_load_saved_model(
         sm_path,
@@ -93,8 +95,7 @@
         pass_pipeline=())
 
     if artifacts_dir is not None:
-      tf_mlir_path = os.path.join(artifacts_dir,
-                                  f"tf_input__{backends_string}.mlir")
+      tf_mlir_path = os.path.join(artifacts_dir, "tf_input.mlir")
       logging.info("Saving raw TF input MLIR to: %s", tf_mlir_path)
       with open(tf_mlir_path, "w") as f:
         f.write(compiler_module.to_asm())
@@ -103,16 +104,15 @@
     compiler_module.run_pass_pipeline(compiler.TF_IMPORT_PASS_PIPELINE)
 
     if artifacts_dir is not None:
-      iree_mlir_path = os.path.join(artifacts_dir,
-                                    f"iree_input__{backends_string}.mlir")
+      iree_mlir_path = os.path.join(artifacts_dir, "iree_input.mlir")
       logging.info("Saving IREE input MLIR to: %s", iree_mlir_path)
       with open(iree_mlir_path, "w") as f:
         f.write(compiler_module.to_asm())
 
     compiled_module = compiler_module.compile(target_backends=target_backends)
     if artifacts_dir is not None:
-      compiled_path = os.path.join(artifacts_dir,
-                                   f"compiled__{backends_string}.vmfb")
+      compiled_name = f"compiled__{backends_to_str(target_backends)}.vmfb"
+      compiled_path = os.path.join(artifacts_dir, compiled_name)
       logging.info("Saving compiled IREE module to: %s", compiled_path)
       with open(compiled_path, "wb") as f:
         f.write(compiled_module)
@@ -133,51 +133,29 @@
 
 
 class CompiledModule(object):
-  """Base class for the TF and IREE compiled module facades."""
-
-  @staticmethod
-  def compile(module_class,
-              backend_info,
-              exported_names=(),
-              artifacts_dir=None):
-    """Compile a tf.Module using the CompiledModule subclass in backend_info.
-
-    Args:
-      module_class: the tf.Module subclass to compile.
-      backend_info: an element of BackendInfo corresponding to the backend to
-        compile to. If a TF 'backend' is provided then the module is wrapped in
-        a TfCompiledModule.
-      exported_names: an optional iterable of strings representing which of the
-        module_class's functions to compile. If exported_names is empty all
-        functions will be compiled.
-      artifacts_dir: an optional path to save compilation artifacts to.
-    """
-    compile = backend_info.CompiledModule.compile
-    return compile(module_class, backend_info, exported_names, artifacts_dir)
-
-  @staticmethod
-  def from_existing(module):
-    """Duplicates 'module' with the tf.Module's state without recompiling."""
-    # Use the backend_info attr to determine which subclass' constructor to use.
-    from_existing = module._backend_info.CompiledModule.from_existing
-    return from_existing(module)
+  """Base class for the TF and IREE compiled modules."""
 
   def __init__(self, module_class, backend_info, exported_names, artifacts_dir):
-    """Default constructor – use `compile` or `from_existing` instead."""
+    """Shared base constructor – not useful on its own."""
     self._module_class = module_class
     self._backend_info = backend_info
     self._exported_names = exported_names
     self._artifacts_dir = artifacts_dir
 
+  def create_reinitialized(self):
+    """Duplicates this module with its initial state without recompiling."""
+    raise NotImplementedError()
+
 
 class IreeCompiledModule(CompiledModule):
   """Iree compiled module."""
 
-  @staticmethod
-  def compile(module_class,
-              backend_info,
-              exported_names=(),
-              artifacts_dir=None):
+  def __init__(self,
+               module_class,
+               backend_info,
+               exported_names=[],
+               artifacts_dir=None,
+               _create_reinitialized_args=None):
     """Compile a tf.Module to the target backend in backend_info.
 
     Args:
@@ -189,30 +167,9 @@
         functions will be compiled.
       artifacts_dir: an optional path to save compilation artifacts to.
     """
-    return IreeCompiledModule(module_class, backend_info, exported_names,
-                              artifacts_dir)
-
-  @staticmethod
-  def from_existing(module):
-    """Duplicates 'module' with the tf.Module's state without recompiling."""
-    default_args = [
-        module._module_class, module._backend_info, module._exported_names,
-        module._artifacts_dir
-    ]
-    from_existing_args = [module._module_blob, module._module, module._config]
-    return IreeCompiledModule(*default_args, from_existing_args)
-
-  def __init__(self,
-               module_class,
-               backend_info,
-               exported_names,
-               artifacts_dir,
-               _from_existing_args=None):
-    """Default constructor – use `compile` or `from_existing` instead."""
     super().__init__(module_class, backend_info, exported_names, artifacts_dir)
 
-    if _from_existing_args is None:
-      # Called from IreeCompiledModule.compile(...)
+    if _create_reinitialized_args is None:
       self._module_blob = compile_tf_module(
           tf_module=module_class(),
           target_backends=backend_info.iree_compiler_targets,
@@ -221,13 +178,22 @@
       self._module = rt.VmModule.from_flatbuffer(self._module_blob)
       self._config = rt.Config(driver_name=backend_info.iree_driver)
     else:
-      # Called from IreeCompiledModule.from_existing(module)
-      self._module_blob, self._module, self._config = _from_existing_args
+      # Called from self.create_reinitialized()
+      self._module_blob, self._module, self._config = _create_reinitialized_args
 
     # Holds all of the module's mutable state.
     self._context = rt.SystemContext(
         modules=[self._module], config=self._config)
 
+  def create_reinitialized(self):
+    """Duplicates this module with its initial state without recompiling."""
+    default_args = [
+        self._module_class, self._backend_info, self._exported_names,
+        self._artifacts_dir
+    ]
+    create_reinitialized_args = [self._module_blob, self._module, self._config]
+    return IreeCompiledModule(*default_args, create_reinitialized_args)
+
   def __getattr__(self, attr):
     # Try to resolve it as a function.
     m = self._context.modules[self._module.name]
@@ -253,11 +219,11 @@
   normalize TensorFlow's output to Numpy.
   """
 
-  @staticmethod
-  def compile(module_class,
-              backend_info,
-              exported_names=(),
-              artifacts_dir=None):
+  def __init__(self,
+               module_class,
+               backend_info,
+               exported_names=[],
+               artifacts_dir=None):
     """Wrap a tf.Module in a TFCompiledModule facade.
 
     Args:
@@ -269,23 +235,14 @@
       artifacts_dir: an optional path to save compilation artifacts to. Has no
         effect for this subclass as nothing is compiled.
     """
-    return TfCompiledModule(module_class, backend_info, exported_names,
-                            artifacts_dir)
-
-  @staticmethod
-  def from_existing(module):
-    """Duplicates 'module's facade with the starting state of module_class."""
-    duplicate_module = TfCompiledModule(module._module_class,
-                                        module._backend_info,
-                                        module._exported_names,
-                                        module._artifacts_dir)
-    return duplicate_module
-
-  def __init__(self, module_class, backend_info, exported_names, artifacts_dir):
-    """Default constructor – use `compile` or `from_existing` instead."""
     super().__init__(module_class, backend_info, exported_names, artifacts_dir)
     self._tf_module = module_class()
 
+  def create_reinitialized(self):
+    """Duplicates this module with the starting state of module_class."""
+    return TfCompiledModule(self._module_class, self._backend_info,
+                            self._exported_names, self._artifacts_dir)
+
   def __getattr__(self, attr):
     # Try to resolve it as a function.
     exported = len(self._exported_names) == 0 or attr in self._exported_names
diff --git a/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_utils_test.py b/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_utils_test.py
index dcc3aec..b1d9adb 100644
--- a/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_utils_test.py
+++ b/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_utils_test.py
@@ -17,6 +17,7 @@
 import os
 import tempfile
 
+from absl import logging
 from absl.testing import parameterized
 from pyiree.tf.support import tf_utils
 import tensorflow as tf
@@ -52,7 +53,7 @@
       },
       {
           'testcase_name': 'multiple_backends',
-          'target_backends': ['vmla', 'llvm'],
+          'target_backends': ['vmla', 'llvm-ir'],
       },
   ])
   def test_artifact_saving(self, target_backends):
@@ -65,12 +66,14 @@
 
       artifacts_to_check = [
           'saved_model',
-          f'tf_input__{"__".join(target_backends)}.mlir',
-          f'iree_input__{"__".join(target_backends)}.mlir',
-          f'compiled__{"__".join(target_backends)}.vmfb',
+          'tf_input.mlir',
+          'iree_input.mlir',
+          f'compiled__{tf_utils.backends_to_str(target_backends)}.vmfb',
       ]
       for artifact in artifacts_to_check:
-        self.assertTrue(os.path.exists(os.path.join(artifacts_dir, artifact)))
+        artifact_path = os.path.join(artifacts_dir, artifact)
+        logging.info('Checking path: %s', artifact_path)
+        self.assertTrue(os.path.exists(artifact_path))
 
   @parameterized.named_parameters([
       {
@@ -83,15 +86,15 @@
       },
   ])
   def test_unaltered_state(self, backend_name):
-    info = tf_utils.BackendInfo.ALL[backend_name]
-    module = tf_utils.CompiledModule.compile(StatefulCountingModule, info)
+    backend_info = tf_utils.BackendInfo.ALL[backend_name]
+    module = backend_info.CompiledModule(StatefulCountingModule, backend_info)
 
     # Test that incrementing works properly.
     self.assertEqual([0.], module.get_count())
     module.increment()
     self.assertEqual([1.], module.get_count())
 
-    reinitialized_module = tf_utils.CompiledModule.from_existing(module)
+    reinitialized_module = module.create_reinitialized()
     # Test reinitialization.
     self.assertEqual([0.], reinitialized_module.get_count())
     # Test independent state.
diff --git a/integrations/tensorflow/compiler/BUILD b/integrations/tensorflow/compiler/BUILD
index 080ea20..722e04b 100644
--- a/integrations/tensorflow/compiler/BUILD
+++ b/integrations/tensorflow/compiler/BUILD
@@ -62,6 +62,7 @@
         "//iree/tools:iree_opt_main",
         "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration",
         "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:tensorflow_passes",
+        "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:tf_saved_model_passes",
         "@org_tensorflow//tensorflow/compiler/mlir/xla:xla_legalize_tf",
     ],
 )
diff --git a/integrations/tensorflow/e2e/BUILD b/integrations/tensorflow/e2e/BUILD
index 2fd4545..d63b839 100644
--- a/integrations/tensorflow/e2e/BUILD
+++ b/integrations/tensorflow/e2e/BUILD
@@ -93,6 +93,7 @@
     "matrix_ops_test.py",
     "ring_buffer_test.py",  # TODO(b/148747011)
     "scatter_update_test.py",
+    "sliding_window_test.py",  # TODO(#2659)
     "strings_test.py",
 ]
 
diff --git a/integrations/tensorflow/e2e/README.md b/integrations/tensorflow/e2e/README.md
index 341aa56..04604d8 100644
--- a/integrations/tensorflow/e2e/README.md
+++ b/integrations/tensorflow/e2e/README.md
@@ -26,10 +26,10 @@
 ## Compiling `tf.Module`s
 
 Compatible TensorFlow modules can be compiled to specific IREE backends using
-`IreeCompiledModule.compile(...)`. This also optionally saves compilation
-artifacts to a specified directory. These artifacts include: MLIR across various
-lowerings, a TensorFlow SavedModel, and the compiled VM FlatBuffer. A basic
-example of creating and calling an `IreeCompiledModule` can be found in
+`IreeCompiledModule`. This also optionally saves compilation artifacts to a
+specified directory. These artifacts include: MLIR across various lowerings, a
+TensorFlow SavedModel, and the compiled VM FlatBuffer. A basic example of
+creating and calling an `IreeCompiledModule` can be found in
 [`tf_utils_test.py`](https://github.com/google/iree/blob/main/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_utils_test.py)
 
 When using Keras models or tf.Modules with functions that IREE can't compile,
@@ -38,7 +38,7 @@
 ```python
 from pyiree.tf.support import tf_utils
 vmla_module = tf_utils.IreeCompiledModule(
-    constructor=KerasTFModuleClass,
+    module_class=KerasTFModuleClass,
     backend_info=tf_utils.BackendInfo.ALL['iree_vmla'],
     exported_names=['predict'])
 vmla_module.predict(...)
diff --git a/integrations/tensorflow/e2e/keras/vision_model_test.py b/integrations/tensorflow/e2e/keras/vision_model_test.py
index 4f3917e..54374ff 100644
--- a/integrations/tensorflow/e2e/keras/vision_model_test.py
+++ b/integrations/tensorflow/e2e/keras/vision_model_test.py
@@ -152,5 +152,6 @@
 
   tf.test.main()
 
+
 if __name__ == '__main__':
   app.run(main)
diff --git a/iree/base/BUILD b/iree/base/BUILD
index 052bd1d..8d6b446 100644
--- a/iree/base/BUILD
+++ b/iree/base/BUILD
@@ -435,6 +435,15 @@
     ],
 )
 
+cc_test(
+    name = "time_test",
+    srcs = ["time_test.cc"],
+    deps = [
+        ":time",
+        "//iree/testing:gtest_main",
+    ],
+)
+
 cc_library(
     name = "tracing",
     hdrs = ["tracing.h"],
diff --git a/iree/base/CMakeLists.txt b/iree/base/CMakeLists.txt
index a6dccb2..322bf43 100644
--- a/iree/base/CMakeLists.txt
+++ b/iree/base/CMakeLists.txt
@@ -524,6 +524,16 @@
   PUBLIC
 )
 
+iree_cc_test(
+  NAME
+    time_test
+  SRCS
+    "time_test.cc"
+  DEPS
+    ::time
+    iree::testing::gtest_main
+)
+
 if(${IREE_ENABLE_RUNTIME_TRACING})
   iree_cc_library(
     NAME
diff --git a/iree/base/api.h b/iree/base/api.h
index e1b5bdc..ba8ec9b 100644
--- a/iree/base/api.h
+++ b/iree/base/api.h
@@ -374,7 +374,7 @@
 // Like absl::Duration, represented as relative nanoseconds.
 typedef int64_t iree_duration_t;
 // Like absl::InfiniteDuration.
-#define IREE_DURATION_INFINITE INT64_MIN
+#define IREE_DURATION_INFINITE INT64_MAX
 // Like absl::ZeroDuration.
 #define IREE_DURATION_ZERO 0
 
diff --git a/iree/base/signature_mangle.h b/iree/base/signature_mangle.h
index 948dbb5..8afb4b5 100644
--- a/iree/base/signature_mangle.h
+++ b/iree/base/signature_mangle.h
@@ -156,7 +156,7 @@
 // -----------------------------------------------------------------------------
 
 // Mangles raw function signatures.
-// See function_abi.md.
+// See docs/design_docs/function_abi.md.
 class RawSignatureMangler {
  public:
   static SignatureBuilder ToFunctionSignature(const SignatureBuilder& inputs,
@@ -364,7 +364,8 @@
 // Mangles function signatures according to the Sip (Structured Index Path) V1
 // scheme.
 //
-// Mangler for the 'sip' ABI. See function_abi.md in the documentation.
+// Mangler for the 'sip' ABI. See docs/design_docs/function_abi.md in the
+// documentation.
 class SipSignatureMangler {
  public:
   enum class IndexMode {
@@ -443,7 +444,8 @@
 // Parser for signatures generated by SipSignatureMangler.
 // This uses a Visitor interface to walk either input or result structs.
 //
-// Mangler for the 'sip' ABI. See function_abi.md in the documentation.
+// Mangler for the 'sip' ABI. See docs/design_docs/function_abi.md in the
+// documentation.
 class SipSignatureParser {
  public:
   enum class StructType {
diff --git a/iree/base/time.h b/iree/base/time.h
index e88b3b8..e51a5b1 100644
--- a/iree/base/time.h
+++ b/iree/base/time.h
@@ -15,6 +15,9 @@
 #ifndef IREE_BASE_TIME_H_
 #define IREE_BASE_TIME_H_
 
+#include <type_traits>
+#include <utility>
+
 #include "iree/base/api.h"
 
 namespace iree {
@@ -43,7 +46,7 @@
     return !(lhs == rhs);
   }
   friend inline bool operator<(const ChronoType& lhs, const ChronoType& rhs) {
-    return rhs.value_ < lhs.value_;
+    return lhs.value_ < rhs.value_;
   }
   friend inline bool operator>(const ChronoType& lhs, const ChronoType& rhs) {
     return rhs < lhs;
diff --git a/iree/base/time_test.cc b/iree/base/time_test.cc
new file mode 100644
index 0000000..114cd4e
--- /dev/null
+++ b/iree/base/time_test.cc
@@ -0,0 +1,45 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/base/time.h"
+
+#include "iree/testing/gtest.h"
+
+namespace iree {
+namespace {
+
+TEST(Time, DurationComparisons) {
+  EXPECT_TRUE(Milliseconds(123) == Milliseconds(123));
+  EXPECT_FALSE(Milliseconds(123) == Milliseconds(456));
+  EXPECT_FALSE(Milliseconds(123) != Milliseconds(123));
+  EXPECT_TRUE(Milliseconds(123) != Milliseconds(456));
+
+  EXPECT_TRUE(Milliseconds(123) < Milliseconds(456));
+  EXPECT_FALSE(Milliseconds(123) > Milliseconds(456));
+  EXPECT_FALSE(Milliseconds(123) > Milliseconds(123));
+  EXPECT_FALSE(Milliseconds(123) < Milliseconds(123));
+
+  EXPECT_TRUE(Milliseconds(123) <= Milliseconds(123));
+  EXPECT_TRUE(Milliseconds(123) >= Milliseconds(123));
+  EXPECT_TRUE(Milliseconds(123) <= Milliseconds(456));
+  EXPECT_FALSE(Milliseconds(123) >= Milliseconds(456));
+}
+
+TEST(Time, DurationArithmetic) {
+  EXPECT_EQ(Milliseconds(150), Milliseconds(100) + Milliseconds(50));
+  EXPECT_EQ(Milliseconds(50), Milliseconds(100) - Milliseconds(50));
+}
+
+}  // namespace
+}  // namespace iree
diff --git a/iree/compiler/Conversion/HLOToLinalg/HLOToLinalgOnBuffers.cpp b/iree/compiler/Conversion/HLOToLinalg/HLOToLinalgOnBuffers.cpp
index 99f7eb1..90178fc 100644
--- a/iree/compiler/Conversion/HLOToLinalg/HLOToLinalgOnBuffers.cpp
+++ b/iree/compiler/Conversion/HLOToLinalg/HLOToLinalgOnBuffers.cpp
@@ -27,6 +27,7 @@
 #include "iree/compiler/Dialect/IREE/IR/IREEOps.h"
 #include "iree/compiler/Dialect/Shape/IR/ShapeOps.h"
 #include "llvm/ADT/APInt.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
@@ -444,7 +445,7 @@
       rewriter.getI64IntegerAttr(1),                    // args_out
       rewriter.getArrayAttr(indexingMaps),
       getParallelAndReductionIterAttrs(rewriter, nloops, nonParallelLoops),
-      /*doc=*/nullptr, /*library_call=*/nullptr);
+      /*doc=*/nullptr, /*library_call=*/nullptr, /*symbol_source=*/nullptr);
 
   // Add a block to the region.
   auto *region = &linalgOp.region();
@@ -523,11 +524,6 @@
 LogicalResult PadOpConversion::apply(
     mhlo::PadOp op, ArrayRef<Value> inputBuffers, ArrayRef<Value> resultBuffers,
     ConversionPatternRewriter &rewriter) const {
-  if (llvm::any_of(op.interior_padding().getValues<IntegerAttr>(),
-                   [](auto attr) { return attr.getInt() != 0; }))
-    return op.emitError(
-        "pad op with non-zero interiror_padding is not supported");
-
   mhlo::PadOp::Adaptor adaptor(inputBuffers);
   auto loc = op.getLoc();
 
@@ -535,84 +531,31 @@
   Value paddingVal =
       paddingConstVal
           ? rewriter.create<ConstantOp>(loc, paddingConstVal).getResult()
-          : adaptor.padding_value();
+          : rewriter.create<LoadOp>(loc, adaptor.padding_value());
 
-  auto operandType = adaptor.operand().getType().cast<ShapedType>();
-  int rank = operandType.getRank();
-
-  SmallVector<Attribute, 2> indexingMaps;
-  indexingMaps.emplace_back(getPadOpInputIndexingMap(op, rank, rewriter));
-  if (!paddingConstVal) {
-    indexingMaps.emplace_back(AffineMapAttr::get(
-        AffineMap::get(rank, /*symbolCount=*/0, rewriter.getContext())));
-  }
-  indexingMaps.emplace_back(AffineMapAttr::get(
-      AffineMap::getMultiDimIdentityMap(rank, rewriter.getContext())));
-
-  SmallVector<Type, 2> resultTypes = {};
-  SmallVector<Value, 2> linalgOpArgs = {adaptor.operand()};
-  if (!paddingConstVal) linalgOpArgs.push_back(adaptor.padding_value());
-  linalgOpArgs.push_back(resultBuffers[0]);
-  auto linalgOp = rewriter.create<linalg::IndexedGenericOp>(
-      loc, resultTypes, linalgOpArgs,
-      rewriter.getI64IntegerAttr(linalgOpArgs.size() - 1),  // args_in
-      rewriter.getI64IntegerAttr(1),                        // args_out
-      rewriter.getArrayAttr(indexingMaps),
-      getParallelAndReductionIterAttrs(rewriter, rank, /*nReduction=*/0),
-      /*doc=*/nullptr, /*library_call=*/nullptr);
-
-  // Add a block to the region.
-  auto *region = &linalgOp.region();
-  auto *block = rewriter.createBlock(region, region->end());
-  SmallVector<Type, 4> bodyArgTypes;
-  bodyArgTypes.append(rank, rewriter.getIndexType());
-  bodyArgTypes.append(linalgOpArgs.size(), operandType.getElementType());
-  block->addArguments(bodyArgTypes);
-  rewriter.setInsertionPointToEnd(block);
-
-  // If the `index` of the result at a particular dimension i, is d_i, check if
-  //
-  // (d_i >= edge_padding_low[i]) &&
-  // (d_i < (edge_padding_low[i] + operand_shape[i])).
-  //
-  // If true, then use the value of the operand, otherwise use the padding
-  // value.
   const auto &edgePaddingLow = op.edge_padding_low();
-  const auto &edgePaddingHigh = op.edge_padding_high();
-
-  Type indexType = rewriter.getIndexType();
-  Value cond = nullptr;
-  auto applyAndOp = [&](Value val) {
-    cond = cond ? rewriter.create<AndOp>(loc, cond, val) : val;
-  };
-  for (int i = 0; i < rank; ++i) {
-    Value dim = block->getArgument(i);
-    int64_t paddingLow = edgePaddingLow.getValue<IntegerAttr>(i).getInt();
-    int64_t paddingHigh = edgePaddingHigh.getValue<IntegerAttr>(i).getInt();
-    auto low = rewriter.create<ConstantOp>(
-        loc, indexType, rewriter.getIntegerAttr(indexType, paddingLow));
-
-    // d_i < (edge_padding_low[i] + operand_shape[i])
-    if (paddingLow != 0 && paddingHigh != 0) {
-      auto operandExtent = rewriter.create<DimOp>(loc, adaptor.operand(), i);
-      auto bound = rewriter.create<AddIOp>(loc, low, operandExtent);
-      auto checkUb =
-          rewriter.create<CmpIOp>(loc, CmpIPredicate::slt, dim, bound);
-      applyAndOp(checkUb);
-    }
-
-    if (paddingLow != 0) {
-      // d_i >= edge_padding_low[i]
-      auto checkLb = rewriter.create<CmpIOp>(loc, CmpIPredicate::sge, dim, low);
-      applyAndOp(checkLb);
-    }
+  const auto &interiorPadding = op.interior_padding();
+  SmallVector<Value, 3> offsets, sizes, strides;
+  for (auto it : llvm::enumerate(llvm::zip(edgePaddingLow, interiorPadding))) {
+    Value startIndex = rewriter.create<ConstantIndexOp>(
+        loc, std::get<0>(it.value()).getZExtValue());
+    offsets.push_back(startIndex);
+    Value size = rewriter.create<DimOp>(loc, resultBuffers[0], it.index());
+    sizes.push_back(size);
+    Value stride = rewriter.create<ConstantIndexOp>(
+        loc, std::get<1>(it.value()).getZExtValue() + 1);
+    strides.push_back(stride);
   }
-  Value inputVal = block->getArgument(rank);
-  if (!paddingConstVal) paddingVal = block->getArgument(rank + 1);
-  Value result =
-      cond ? rewriter.create<SelectOp>(loc, cond, inputVal, paddingVal)
-           : inputVal;
-  rewriter.create<linalg::YieldOp>(loc, result);
+
+  // TODO(hanchung): Move SubViewOp this down to before where it is used.
+  // The pass for splitting dispatch function for vulkan requires no other ops
+  // interleave with Linalg structured ops, so put the SubViewOp in the
+  // beginning.
+  auto subViewOp = rewriter.create<SubViewOp>(loc, resultBuffers[0], offsets,
+                                              sizes, strides);
+  rewriter.create<linalg::FillOp>(loc, resultBuffers[0], paddingVal);
+  rewriter.create<linalg::CopyOp>(loc, inputBuffers[0], subViewOp);
+
   return success();
 }
 
@@ -691,8 +634,9 @@
   int batch = op.batch_dims().getSExtValue();
   auto indexShapeType = adaptor.index().getType().dyn_cast<ShapedType>();
   int nIndices = indexShapeType.getRank();
-  if (batch < 0)
-    return op.emitError("expected batch_dims is greater than or equal to zero");
+  auto inputShapeType = adaptor.input().getType().dyn_cast<ShapedType>();
+  if (axis < 0) axis += inputShapeType.getRank();
+  if (batch < 0) batch += nIndices;
 
   Location loc = op.getLoc();
   Value output = op.getResult();
@@ -715,7 +659,7 @@
       rewriter.getI64IntegerAttr(1),  // args_out
       rewriter.getArrayAttr(indexingMaps),
       getParallelAndReductionIterAttrs(rewriter, rank, /*nReduction=*/0),
-      /*doc=*/nullptr, /*library_call=*/nullptr);
+      /*doc=*/nullptr, /*library_call=*/nullptr, /*symbol_source=*/nullptr);
 
   // Add a block to the region.
   auto *region = &linalgOp.region();
@@ -1019,7 +963,7 @@
       rewriter.getArrayAttr(indexingMaps),
       getParallelAndReductionIterAttrs(rewriter, nInputRank,
                                        reductionDims.size()),
-      /*doc=*/nullptr, /*library_call=*/nullptr);
+      /*doc=*/nullptr, /*library_call=*/nullptr, /*symbol_source=*/nullptr);
 
   linalgOp.region().takeBody(reduceOp.body());
   {
@@ -1095,7 +1039,8 @@
         op.getLoc(), ArrayRef<Type>(), opArgs, op.args_in(), op.args_out(),
         op.indexing_maps(), op.iterator_types(),
         /*doc=*/nullptr,
-        /*library_call=*/nullptr);
+        /*library_call=*/nullptr,
+        /*symbol_source=*/nullptr);
     // Move the region from the replaced op into the new op.
     unsigned numTensorOperands = op.getNumOperands();
     // indexed_generic op has arguments for each index. In the case of generic
diff --git a/iree/compiler/Conversion/HLOToLinalg/test/pad.mlir b/iree/compiler/Conversion/HLOToLinalg/test/pad.mlir
index a167ef4..8d1d96d 100644
--- a/iree/compiler/Conversion/HLOToLinalg/test/pad.mlir
+++ b/iree/compiler/Conversion/HLOToLinalg/test/pad.mlir
@@ -5,7 +5,8 @@
   func @pad_cst() {
     %c0 = constant 0 : index
     %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<12x4xf32>
-    // CHECK: linalg.indexed_generic
+    // CHECK: linalg.fill
+    // CHECK: linalg.copy
     %1 = constant dense<0.0> : tensor<f32>
     %2 = "mhlo.pad"(%0, %1) {
       edge_padding_high = dense<[2, 3]> : tensor<2xi64>,
@@ -29,7 +30,8 @@
     %c0 = constant 0 : index
     %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<12x4xf32>
     %1 = hal.interface.load.tensor @legacy_io::@arg1, offset = %c0 : tensor<f32>
-    // CHECK: linalg.indexed_generic
+    // CHECK: linalg.fill
+    // CHECK: linalg.copy
     %2 = "mhlo.pad"(%0, %1) {
       edge_padding_high = dense<[2, 3]> : tensor<2xi64>,
       edge_padding_low = dense<[4, 5]> : tensor<2xi64>,
@@ -52,7 +54,8 @@
   func @pad_no_op() {
     %c0 = constant 0 : index
     %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<12x4xf32>
-    // CHECK: linalg.indexed_generic
+    // CHECK: linalg.fill
+    // CHECK: linalg.copy
     %1 = constant dense<0.0> : tensor<f32>
     %2 = "mhlo.pad"(%0, %1) {
       edge_padding_high = dense<0> : tensor<2xi64>,
diff --git a/iree/compiler/Conversion/LinalgToLLVM/BUILD b/iree/compiler/Conversion/LinalgToLLVM/BUILD
index 41d9988..c6cafc1 100644
--- a/iree/compiler/Conversion/LinalgToLLVM/BUILD
+++ b/iree/compiler/Conversion/LinalgToLLVM/BUILD
@@ -22,13 +22,14 @@
     name = "LinalgToLLVM",
     srcs = [
         "ConvertToLLVM.cpp",
-        "HALInterfaceToMemrefArguments.cpp",
+        "MatMulVectorization.cpp",
         "Passes.cpp",
     ],
     hdrs = [
         "Passes.h",
     ],
     deps = [
+        "//iree/compiler/Conversion/CodegenUtils",
         "//iree/compiler/Conversion/HLOToLinalg",
         "//iree/compiler/Dialect/HAL/IR",
         "//iree/compiler/Dialect/HAL/IR:HALDialect",
@@ -46,6 +47,7 @@
         "@llvm-project//mlir:StandardOps",
         "@llvm-project//mlir:StandardOpsTransforms",
         "@llvm-project//mlir:Transforms",
+        "@llvm-project//mlir:VectorOps",
         "@llvm-project//mlir:VectorToLLVM",
         "@llvm-project//mlir:VectorToSCF",
     ],
diff --git a/iree/compiler/Conversion/LinalgToLLVM/CMakeLists.txt b/iree/compiler/Conversion/LinalgToLLVM/CMakeLists.txt
index d21dc19..fddc144 100644
--- a/iree/compiler/Conversion/LinalgToLLVM/CMakeLists.txt
+++ b/iree/compiler/Conversion/LinalgToLLVM/CMakeLists.txt
@@ -21,7 +21,7 @@
     "Passes.h"
   SRCS
     "ConvertToLLVM.cpp"
-    "HALInterfaceToMemrefArguments.cpp"
+    "MatMulVectorization.cpp"
     "Passes.cpp"
   DEPS
     MLIRAffineToStandard
@@ -35,8 +35,10 @@
     MLIRStandardOpsTransforms
     MLIRStandardToLLVM
     MLIRTransforms
+    MLIRVector
     MLIRVectorToLLVM
     MLIRVectorToSCF
+    iree::compiler::Conversion::CodegenUtils
     iree::compiler::Conversion::HLOToLinalg
     iree::compiler::Dialect::HAL::IR
     iree::compiler::Dialect::HAL::IR::HALDialect
diff --git a/iree/compiler/Conversion/LinalgToLLVM/ConvertToLLVM.cpp b/iree/compiler/Conversion/LinalgToLLVM/ConvertToLLVM.cpp
index 193eb3c..777fb02 100644
--- a/iree/compiler/Conversion/LinalgToLLVM/ConvertToLLVM.cpp
+++ b/iree/compiler/Conversion/LinalgToLLVM/ConvertToLLVM.cpp
@@ -17,6 +17,7 @@
 #include "iree/compiler/Dialect/HAL/IR/HALOps.h"
 #include "iree/compiler/Dialect/IREE/IR/IREEOps.h"
 #include "iree/compiler/Dialect/Shape/IR/ShapeOps.h"
+#include "iree/compiler/Dialect/Shape/IR/ShapeTypes.h"
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h"
 #include "mlir/Conversion/SCFToStandard/SCFToStandard.h"
@@ -27,6 +28,7 @@
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/Dialect/StandardOps/Transforms/Passes.h"
+#include "mlir/Dialect/Vector/VectorOps.h"
 #include "mlir/Pass/Pass.h"
 
 namespace mlir {
@@ -124,6 +126,180 @@
   }
 };
 
+/// Returns true if `aOp` has a descriptor (set, binding) pair smaller than
+/// `bOp`. Note that this ignores the offset.
+bool operator<(IREE::HAL::InterfaceBindingOp aOp,
+               IREE::HAL::InterfaceBindingOp bOp) {
+  if (aOp.set().getZExtValue() == bOp.set().getZExtValue())
+    return aOp.binding().getZExtValue() < bOp.binding().getZExtValue();
+  return aOp.set().getZExtValue() < bOp.set().getZExtValue();
+}
+
+// Change signature of entry function to func
+// entry_func(%packed_buffers_arg_ptr:
+// !<llvm.int8**>, %push_constant: !<llvm.int64*>) and lower IREE and HAL ops to
+// corresponding LLVMIR ops to construct memref descriptors and load
+// push_constant values.
+class ConvertFuncWithHALInterface : public ConvertToLLVMPattern {
+ public:
+  explicit ConvertFuncWithHALInterface(MLIRContext *context,
+                                       LLVMTypeConverter &typeconverter)
+      : ConvertToLLVMPattern(FuncOp::getOperationName(), context,
+                             typeconverter) {}
+
+  LogicalResult matchAndRewrite(
+      Operation *op, ArrayRef<Value> operands,
+      ConversionPatternRewriter &rewriter) const override {
+    if (SymbolTable::getSymbolVisibility(op) != SymbolTable::Visibility::Public)
+      return failure();
+    auto funcOp = dyn_cast_or_null<FuncOp>(op);
+    FunctionType fnType = funcOp.getType();
+    if (fnType.getNumInputs() != 0) {
+      return rewriter.notifyMatchFailure(
+          funcOp, "entry function should not have inputs");
+    }
+
+    // Get interface buffers from all the blocks.
+    SmallVector<IREE::PlaceholderOp, 8> bufferOps;
+    SmallVector<IREE::HAL::InterfaceLoadConstantOp, 8> loadOps;
+    for (Block &block : funcOp.getBlocks()) {
+      for (Operation &op : block) {
+        if (auto phOp = dyn_cast<IREE::PlaceholderOp>(op))
+          bufferOps.push_back(phOp);
+        if (auto phOp = dyn_cast<IREE::HAL::InterfaceLoadConstantOp>(op)) {
+          loadOps.push_back(phOp);
+        }
+      }
+    }
+
+    if (bufferOps.empty()) return failure();
+
+    // A map from buffer ops to their corresponding interface binding ops.
+    llvm::DenseMap<Operation *, IREE::HAL::InterfaceBindingOp> bufferBindingMap;
+    for (auto bufferOp : bufferOps) {
+      auto symbol = SymbolTable::lookupNearestSymbolFrom(
+          bufferOp, bufferOp.getAttrOfType<SymbolRefAttr>("binding"));
+      bufferBindingMap[bufferOp] = cast<IREE::HAL::InterfaceBindingOp>(symbol);
+    }
+
+    // Sort buffers according to their descriptor (set, binding) pair.
+    llvm::sort(bufferOps, [&bufferBindingMap](IREE::PlaceholderOp aBuffer,
+                                              IREE::PlaceholderOp bBuffer) {
+      return bufferBindingMap[aBuffer] < bufferBindingMap[bBuffer];
+    });
+
+    // A map from buffer ops to their corresponding function argument indices.
+    llvm::DenseMap<Operation *, unsigned> bufferArgMap;
+    // A map from binding ops to their corresponding function argument indices.
+    llvm::DenseMap<Operation *, unsigned> bindingArgMap;
+    llvm::SmallVector<MemRefType, 4> inputMemRefTypes;
+    llvm::SmallVector<LLVM::LLVMType, 4> inputStructPtrs;
+    unsigned argIndex = 0;
+    for (auto bufferOp : bufferOps) {
+      auto binding = bufferBindingMap[bufferOp];
+      auto it = bindingArgMap.find(binding);
+      if (it != bindingArgMap.end()) {
+        bufferArgMap[bufferOp] = it->second;
+      } else {
+        bindingArgMap[binding] = argIndex;
+        bufferArgMap[bufferOp] = argIndex;
+        ++argIndex;
+      }
+
+      auto memrefType = bufferOp.getType().dyn_cast_or_null<MemRefType>();
+      inputMemRefTypes.push_back(memrefType);
+      auto elementType = typeConverter.convertType(memrefType.getElementType())
+                             .dyn_cast<LLVM::LLVMType>();
+      if (!elementType) return failure();
+      inputStructPtrs.push_back(
+          elementType.getPointerTo(memrefType.getMemorySpace()));
+    }
+
+    TypeConverter::SignatureConversion signatureConverter(/*numOrigInputs=*/0);
+
+    // func foo(%packed_buffer_args: !llvm<i8**>, %push_constant: !llvm<i64*>)
+    auto packedBuffersArgsTy =
+        LLVM::LLVMType::getInt8PtrTy(typeConverter.getDialect()).getPointerTo();
+    auto pushConstantArgTy =
+        LLVM::LLVMType::getInt64Ty(typeConverter.getDialect()).getPointerTo();
+    signatureConverter.addInputs(packedBuffersArgsTy);
+    signatureConverter.addInputs(pushConstantArgTy);
+
+    // Create the new function's signature.
+    Location loc = funcOp.getLoc();
+    auto newFuncOp = rewriter.create<FuncOp>(
+        loc, funcOp.getName(),
+        rewriter.getFunctionType(signatureConverter.getConvertedTypes(),
+                                 llvm::None),
+        ArrayRef<NamedAttribute>());
+
+    // Move all ops in the old function's region to the new function.
+    rewriter.inlineRegionBefore(funcOp.getBody(), newFuncOp.getBody(),
+                                newFuncOp.end());
+    rewriter.applySignatureConversion(&newFuncOp.getBody(), signatureConverter);
+
+    auto builder = OpBuilder::atBlockBegin(&(newFuncOp.getBlocks().front()));
+
+    // Cast and unpack input packed_buffer_arguments and construct memref
+    // descriptors.
+    Value packedBuffersArgsPtr = builder.create<LLVM::BitcastOp>(
+        loc,
+        LLVM::LLVMType::getStructTy(typeConverter.getDialect(), inputStructPtrs)
+            .getPointerTo(),
+        newFuncOp.getArgument(0));
+    Value packedBuffersArgs =
+        builder.create<LLVM::LoadOp>(loc, packedBuffersArgsPtr);
+    for (auto bufferOp : bufferOps) {
+      MemRefType memrefType = bufferOp.getType().dyn_cast_or_null<MemRefType>();
+      if (!memrefType) return failure();
+      const auto index = bufferArgMap[bufferOp];
+      Value bufferPtr = builder.create<LLVM::ExtractValueOp>(
+          loc, inputStructPtrs[index], packedBuffersArgs,
+          rewriter.getI64ArrayAttr(index));
+      if (memrefType.hasStaticShape()) {
+        auto desc = MemRefDescriptor::fromStaticShape(
+            builder, loc, typeConverter, memrefType, bufferPtr);
+        rewriter.replaceOp(bufferOp, {desc});
+      } else {
+        auto desc = MemRefDescriptor::undef(
+            builder, loc, typeConverter.convertType(memrefType));
+        desc.setAllocatedPtr(builder, loc, bufferPtr);
+        desc.setAlignedPtr(builder, loc, bufferPtr);
+        rewriter.replaceOp(bufferOp, {desc});
+      }
+    }
+
+    // Lower hal.interface.load.constant ops into llvm.getelementptr, llvm.load
+    for (auto loadOp : loadOps) {
+      Value offset = builder.create<LLVM::ConstantOp>(
+          loc, LLVM::LLVMType::getInt64Ty(typeConverter.getDialect()),
+          builder.getI64IntegerAttr(loadOp.offset().getZExtValue()));
+      Value constPtr = builder.create<LLVM::GEPOp>(loc, pushConstantArgTy,
+                                                   newFuncOp.getArgument(1),
+                                                   ArrayRef<Value>({offset}));
+      Value dimConstant = builder.create<LLVM::LoadOp>(loc, constPtr);
+      rewriter.replaceOp(loadOp, dimConstant);
+    }
+
+    rewriter.eraseOp(funcOp);
+    return success();
+  }
+};
+
+class RemoveInterfaceOpPattern : public ConvertToLLVMPattern {
+ public:
+  explicit RemoveInterfaceOpPattern(MLIRContext *context,
+                                    LLVMTypeConverter &typeconverter)
+      : ConvertToLLVMPattern(IREE::HAL::InterfaceOp::getOperationName(),
+                             context, typeconverter) {}
+  LogicalResult matchAndRewrite(
+      Operation *op, ArrayRef<Value> operands,
+      ConversionPatternRewriter &rewriter) const override {
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
 namespace {
 struct ConvertToLLVMPass
     : public PassWrapper<ConvertToLLVMPass, OperationPass<ModuleOp>> {
@@ -133,9 +309,25 @@
 }  // namespace
 
 void ConvertToLLVMPass::runOnOperation() {
+  // Vector -> Vector transformation is needed before we do any conversion to
+  // LLVM.
+  {
+    OwningRewritePatternList patterns;
+    vector::populateVectorToVectorCanonicalizationPatterns(patterns,
+                                                           &getContext());
+    vector::populateVectorSlicesLoweringPatterns(patterns, &getContext());
+    vector::populateVectorContractLoweringPatterns(patterns, &getContext());
+    applyPatternsAndFoldGreedily(getOperation(), patterns);
+  }
+  //
   auto module = getOperation();
-  OwningRewritePatternList patterns;
+
   LLVMTypeConverter converter(&getContext());
+  converter.addConversion([](Shape::RankedShapeType, SmallVectorImpl<Type> &) {
+    return success();
+  });
+
+  OwningRewritePatternList patterns;
   populateAffineToStdConversionPatterns(patterns, &getContext());
   populateLoopToStdConversionPatterns(patterns, &getContext());
   populateExpandTanhPattern(patterns, &getContext());
@@ -145,11 +337,12 @@
   populateVectorToLLVMConversionPatterns(converter, patterns);
   populateLinalgToLLVMConversionPatterns(converter, patterns, &getContext());
   // The following patterns resolves dynamic shapes by substituting tie_shape
-  // ops with an updated memref descriptors and replacing RankDimOp with actual
-  // index loaded from memref<?xi32> that holds all dynamic shapes
-  // push constants.
-  patterns.insert<ConvertRankedDimPattern, ConvertTieShapePattern,
-                  RemoveMakeRankedShape>(&getContext(), converter);
+  // ops with an updated memref descriptors and replacing RankDimOp with
+  // actual index loaded from memref<?xi32> that holds all dynamic shapes push
+  // constants.
+  patterns.insert<ConvertFuncWithHALInterface, ConvertRankedDimPattern,
+                  ConvertTieShapePattern, RemoveMakeRankedShape,
+                  RemoveInterfaceOpPattern>(&getContext(), converter);
   LLVMConversionTarget target(getContext());
   target.addLegalOp<ModuleOp, ModuleTerminatorOp>();
   if (failed(applyPartialConversion(module, target, patterns)))
@@ -162,7 +355,8 @@
 
 static PassRegistration<ConvertToLLVMPass> pass(
     "iree-codegen-convert-to-llvm",
-    "Perform final conversion from Linalg/HAL/Shape/Vector/Standard to LLVMIR "
+    "Perform final conversion from Linalg/HAL/Shape/Vector/Standard to "
+    "LLVMIR "
     "dialect",
     [] { return std::make_unique<ConvertToLLVMPass>(); });
 
diff --git a/iree/compiler/Conversion/LinalgToLLVM/HALInterfaceToMemrefArguments.cpp b/iree/compiler/Conversion/LinalgToLLVM/HALInterfaceToMemrefArguments.cpp
deleted file mode 100644
index ac968e4..0000000
--- a/iree/compiler/Conversion/LinalgToLLVM/HALInterfaceToMemrefArguments.cpp
+++ /dev/null
@@ -1,231 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <memory>
-
-#include "iree/compiler/Dialect/HAL/IR/HALDialect.h"
-#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
-#include "iree/compiler/Dialect/IREE/IR/IREEOps.h"
-#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Pass/PassRegistry.h"
-#include "mlir/Transforms/DialectConversion.h"
-
-namespace mlir {
-namespace iree_compiler {
-namespace {
-
-/// Returns true if the given function contains interface related operations
-/// that are used by other ops.
-bool containsUsedInterfaceOp(FuncOp funcOp) {
-  for (Block& block : funcOp.getBlocks()) {
-    for (Operation& op : block) {
-      if (!op.getUses().empty() &&
-          (isa<IREE::PlaceholderOp>(op) ||
-           isa<IREE::HAL::InterfaceLoadConstantOp>(op))) {
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-/// Returns true if `aOp` has a desciptor (set, binding) pair smaller than
-/// `bOp`. Note that this ignores the offset.
-bool operator<(IREE::HAL::InterfaceBindingOp aOp,
-               IREE::HAL::InterfaceBindingOp bOp) {
-  if (aOp.set().getZExtValue() == bOp.set().getZExtValue())
-    return aOp.binding().getZExtValue() < bOp.binding().getZExtValue();
-  return aOp.set().getZExtValue() < bOp.set().getZExtValue();
-}
-
-/// A pattern to process function interface. It replaces interface related ops
-/// with function arguments to match LLVM's CodeGen's ABI contract.
-///
-/// IREE scheduler passes interface ABI information via hal.interface.* ops to
-/// all backends. We create iree.placeholder ops to represent buffers behind
-/// those hal.interface.* ops. However the LLVM CodeGen uses function parameters
-/// and memref descriptors for ABI. So we need to bridge the gap somewhere.
-///
-/// This pass finds all interface buffers used in the function, sort them
-/// according to the descriptor (set, binding) pair, and put unique ones as
-/// function parameters in order.
-/// Note: This should be kept consistent with LLVM's HAL backend.
-struct ProcessFuncInterfacePattern : public OpConversionPattern<FuncOp> {
-  using OpConversionPattern::OpConversionPattern;
-  LogicalResult matchAndRewrite(
-      FuncOp funcOp, ArrayRef<Value> Operands,
-      ConversionPatternRewriter& rewriter) const override {
-    // Only process entry functions.
-    if (SymbolTable::getSymbolVisibility(funcOp) !=
-        SymbolTable::Visibility::Public)
-      return failure();
-
-    FunctionType fnType = funcOp.getType();
-    if (fnType.getNumInputs() != 0)
-      return rewriter.notifyMatchFailure(
-          funcOp, "entry function should not have inputs");
-
-    // Get interface buffers from all the blocks.
-    SmallVector<IREE::PlaceholderOp, 8> bufferOps;
-    SmallVector<IREE::HAL::InterfaceLoadConstantOp, 8> loadOps;
-    for (Block& block : funcOp.getBlocks()) {
-      for (Operation& op : block) {
-        if (auto phOp = dyn_cast<IREE::PlaceholderOp>(op))
-          bufferOps.push_back(phOp);
-        if (auto phOp = dyn_cast<IREE::HAL::InterfaceLoadConstantOp>(op)) {
-          loadOps.push_back(phOp);
-        }
-      }
-    }
-
-    if (bufferOps.empty()) return failure();
-
-    // A map from buffer ops to their corresponding interface binding ops.
-    llvm::DenseMap<Operation*, IREE::HAL::InterfaceBindingOp> bufferBindingMap;
-    for (auto bufferOp : bufferOps) {
-      auto symbol = SymbolTable::lookupNearestSymbolFrom(
-          bufferOp, bufferOp.getAttrOfType<SymbolRefAttr>("binding"));
-      bufferBindingMap[bufferOp] = cast<IREE::HAL::InterfaceBindingOp>(symbol);
-    }
-
-    // Sort buffers according to their descriptor (set, binding) pair.
-    llvm::sort(bufferOps, [&bufferBindingMap](IREE::PlaceholderOp aBuffer,
-                                              IREE::PlaceholderOp bBuffer) {
-      return bufferBindingMap[aBuffer] < bufferBindingMap[bBuffer];
-    });
-
-    // Create a function argument for each of the unique binding pointed by the
-    // buffer ops.
-    TypeConverter::SignatureConversion signatureConverter(/*numOrigInputs=*/0);
-    // A map from buffer ops to their corresponding function argument indices.
-    llvm::DenseMap<Operation*, unsigned> bufferArgMap;
-    // A map from binding ops to their corresponding function argument indices.
-    llvm::DenseMap<Operation*, unsigned> bindingArgMap;
-    unsigned argIndex = 0;
-    for (auto bufferOp : bufferOps) {
-      auto binding = bufferBindingMap[bufferOp];
-      auto it = bindingArgMap.find(binding);
-      if (it != bindingArgMap.end()) {
-        bufferArgMap[bufferOp] = it->second;
-      } else {
-        bindingArgMap[binding] = argIndex;
-        bufferArgMap[bufferOp] = argIndex;
-        signatureConverter.addInputs(bufferOp.getType());
-        ++argIndex;
-      }
-    }
-    Type dynamicDimsBufferType =
-        MemRefType::get(ShapedType::kDynamicSize, rewriter.getIntegerType(32));
-    signatureConverter.addInputs(dynamicDimsBufferType);
-
-    // Create the new function's signature.
-    Location loc = funcOp.getLoc();
-    auto newFuncOp = rewriter.create<FuncOp>(
-        loc, funcOp.getName(),
-        rewriter.getFunctionType(signatureConverter.getConvertedTypes(),
-                                 llvm::None),
-        ArrayRef<NamedAttribute>());
-    newFuncOp.setAttr("llvm.emit_c_interface",
-                      mlir::UnitAttr::get(funcOp.getContext()));
-
-    // Move all ops in the old function's region to the new function.
-    rewriter.inlineRegionBefore(funcOp.getBody(), newFuncOp.getBody(),
-                                newFuncOp.end());
-    rewriter.applySignatureConversion(&newFuncOp.getBody(), signatureConverter);
-
-    // Replace all buffer ops' uses with the newly created function arguments
-    // and erase them.
-    for (auto bufferOp : bufferOps) {
-      bufferOp.replaceAllUsesWith(
-          newFuncOp.getArgument(bufferArgMap[bufferOp]));
-
-      rewriter.eraseOp(bufferOp);
-    }
-
-    // Lower all hal.interface.load.constant ops into std.load
-    // from the last buffer holding all dynamic dimensions with the proper
-    // offset.
-    Type indexType = rewriter.getIndexType();
-    auto builder = OpBuilder::atBlockBegin(&(newFuncOp.getBlocks().front()));
-    auto newLoc = newFuncOp.front().front().getLoc();
-    for (auto loadOp : loadOps) {
-      SmallVector<Value, 1> indices;
-      Value constantOffset = builder.create<ConstantOp>(
-          newLoc, indexType,
-          rewriter.getIntegerAttr(indexType, loadOp.offset().getZExtValue()));
-      indices.push_back(constantOffset);
-      Value loadDim = builder.create<LoadOp>(
-          newLoc, newFuncOp.getArgument(newFuncOp.getNumArguments() - 1),
-          indices);
-      Value loadDimIndex =
-          builder.create<IndexCastOp>(newLoc, loadDim, indexType);
-      loadOp.replaceAllUsesWith(loadDimIndex);
-      rewriter.eraseOp(loadOp);
-    }
-    rewriter.eraseOp(funcOp);
-    return success();
-  }
-};
-
-struct RemoveInterfaceOpPattern
-    : public OpRewritePattern<IREE::HAL::InterfaceOp> {
-  using OpRewritePattern::OpRewritePattern;
-  LogicalResult matchAndRewrite(IREE::HAL::InterfaceOp interfaceOp,
-                                PatternRewriter& rewriter) const override {
-    rewriter.eraseOp(interfaceOp);
-    return success();
-  }
-};
-
-/// Converting from Linalg to LLVM needs to run on a module and since it
-/// applies a full conversion, make a module with jst the impl function.
-struct HALInterfaceToMemrefArgumentsPass
-    : PassWrapper<HALInterfaceToMemrefArgumentsPass, OperationPass<ModuleOp>> {
-  void runOnOperation() override {
-    MLIRContext& context = getContext();
-
-    OwningRewritePatternList patterns;
-    patterns.insert<ProcessFuncInterfacePattern>(&context);
-    patterns.insert<RemoveInterfaceOpPattern>(&context);
-
-    ConversionTarget target(context);
-    // Convert the interface related ops away.
-    target.addDynamicallyLegalOp<FuncOp>(
-        [](FuncOp funcOp) { return !containsUsedInterfaceOp(funcOp); });
-    target.addIllegalOp<IREE::PlaceholderOp>();
-    target.addIllegalDialect<IREE::HAL::HALDialect>();
-    // Allow the rest.
-    target.markUnknownOpDynamicallyLegal([](Operation*) { return true; });
-
-    if (failed(applyFullConversion(getOperation(), target, patterns)))
-      return signalPassFailure();
-  }
-};
-
-}  // namespace
-
-std::unique_ptr<OperationPass<ModuleOp>>
-createHALInterfaceToMemrefArgumentsPass() {
-  return std::make_unique<HALInterfaceToMemrefArgumentsPass>();
-}
-
-static PassRegistration<HALInterfaceToMemrefArgumentsPass> pass(
-    "iree-codegen-hal-interface-to-memref-arguments-pass",
-    "Convert a function with HAL bindings interface to memref arguments",
-    [] { return std::make_unique<HALInterfaceToMemrefArgumentsPass>(); });
-
-}  // namespace iree_compiler
-}  // namespace mlir
diff --git a/iree/compiler/Conversion/LinalgToLLVM/MatMulVectorization.cpp b/iree/compiler/Conversion/LinalgToLLVM/MatMulVectorization.cpp
new file mode 100644
index 0000000..a36358d
--- /dev/null
+++ b/iree/compiler/Conversion/LinalgToLLVM/MatMulVectorization.cpp
@@ -0,0 +1,98 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "iree/compiler/Conversion/CodegenUtils/MatmulCodegenStrategy.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassRegistry.h"
+
+namespace mlir {
+namespace iree_compiler {
+
+static llvm::cl::opt<int> l1TileSize(
+    "iree-codegen-linalg-to-llvm-matmul-l1-tile-size",
+    llvm::cl::desc("Specify the size of L1 tile for matmul vector lowering"),
+    llvm::cl::init(4));
+
+static llvm::cl::opt<int> l2TileSize(
+    "iree-codegen-linalg-to-llvm-matmul-l2-tile-size",
+    llvm::cl::desc("Specify the size of L2 tile for matmul vector lowering"),
+    llvm::cl::init(32));
+
+static llvm::cl::opt<int> l3TileSize(
+    "iree-codegen-linalg-to-llvm-matmul-l3-tile-size",
+    llvm::cl::desc("Specify the size of L3 tile for matmul vector lowering"),
+    llvm::cl::init(64));
+
+static llvm::cl::opt<bool> unrollVectorTransfer(
+    "iree-codegen-linalg-to-llvm-matmul-unroll-vector-transfer",
+    llvm::cl::desc("If true, vector transfer operation loops get unrolled."),
+    llvm::cl::init(true));
+
+static llvm::cl::opt<std::string> vectorOpLowering(
+    "iree-codegen-linalg-to-llvm-matmul-vector-op-lowering",
+    llvm::cl::desc(
+        "Select the vector operation for lowering linalg.matmul, options: "
+        "{'outer_product', 'vector_contract', 'matrix_intrinsics'}"),
+    llvm::cl::init("outer_product"));
+
+namespace {
+struct MatMulTileAndVectorizePass
+    : PassWrapper<MatMulTileAndVectorizePass, FunctionPass> {
+  void runOnFunction() override;
+};
+}  // namespace
+
+void MatMulTileAndVectorizePass::runOnFunction() {
+  FuncOp fn = getFunction();
+  MatmulCodegenStrategy strategy;
+  strategy
+      .tile<linalg::MatmulOp>(linalg::LinalgTilingOptions().setTileSizes(
+          {l3TileSize, l3TileSize, l3TileSize}))
+      .tile<linalg::MatmulOp>(linalg::LinalgTilingOptions().setTileSizes(
+          {l2TileSize, l2TileSize, l2TileSize}))
+      .tile<linalg::MatmulOp>(linalg::LinalgTilingOptions().setTileSizes(
+          {l1TileSize, l1TileSize, l1TileSize}))
+      .vectorize<linalg::MatmulOp>()
+      .setVectorTransferToSCFOptions(
+          VectorTransferToSCFOptions().setUnroll(unrollVectorTransfer));
+  if (vectorOpLowering == "outer_product") {
+    strategy.setVectorTransformsOptions(
+        vector::VectorTransformsOptions().setVectorTransformsOptions(
+            vector::VectorContractLowering::OuterProduct));
+  } else if (vectorOpLowering == "vector_contract") {
+    strategy.setVectorTransformsOptions(
+        vector::VectorTransformsOptions().setVectorTransformsOptions(
+            vector::VectorContractLowering::OuterProduct));
+  } else if (vectorOpLowering == "matrix_intrinsics") {
+    strategy.setVectorTransformsOptions(
+        vector::VectorTransformsOptions().setVectorTransformsOptions(
+            vector::VectorContractLowering::OuterProduct));
+  } else {
+    signalPassFailure();
+  }
+  strategy.setDefaultCPULowering();
+  strategy.transform(fn);
+}
+
+std::unique_ptr<FunctionPass> createMatMulTileAndVectorizePass() {
+  return std::make_unique<MatMulTileAndVectorizePass>();
+}
+
+static PassRegistration<MatMulTileAndVectorizePass> pass(
+    "iree-codegen-linalg-to-llvm-matmul-vectorization-pass",
+    "Tile and vectorize linalg.matmul operation",
+    [] { return std::make_unique<MatMulTileAndVectorizePass>(); });
+
+}  // namespace iree_compiler
+}  // namespace mlir
diff --git a/iree/compiler/Conversion/LinalgToLLVM/Passes.cpp b/iree/compiler/Conversion/LinalgToLLVM/Passes.cpp
index e8c6d9c..8631cdf 100644
--- a/iree/compiler/Conversion/LinalgToLLVM/Passes.cpp
+++ b/iree/compiler/Conversion/LinalgToLLVM/Passes.cpp
@@ -25,6 +25,8 @@
 namespace iree_compiler {
 
 void addLinalgToLLVMPasses(OpPassManager &passManager) {
+  // Linalg -> Vector ops.
+  passManager.addPass(createMatMulTileAndVectorizePass());
   // Linalg -> SCF
   passManager.addPass(createConvertLinalgToLoopsPass());
   passManager.addPass(createCanonicalizerPass());
@@ -35,10 +37,7 @@
   passManager.addPass(createCanonicalizerPass());
   passManager.addPass(createCSEPass());
 
-  // Convert ExecuableOp entry function to use memref arguments.
-  passManager.addPass(createHALInterfaceToMemrefArgumentsPass());
-
-  // (Linalg, STD) -> LLVM
+  // (HAL, IREE, Linalg, STD) -> LLVM
   // OpPassManager& llvmPassManager = passManager.nest<ModuleOp>();
   passManager.addPass(createConvertToLLVMPass());
   passManager.addPass(createCanonicalizerPass());
diff --git a/iree/compiler/Conversion/LinalgToLLVM/Passes.h b/iree/compiler/Conversion/LinalgToLLVM/Passes.h
index 5bfb893..2a4db8c 100644
--- a/iree/compiler/Conversion/LinalgToLLVM/Passes.h
+++ b/iree/compiler/Conversion/LinalgToLLVM/Passes.h
@@ -20,10 +20,8 @@
 namespace mlir {
 namespace iree_compiler {
 
-/// Converts function signture type from hal interface op annotation to memref
-/// argument.
-std::unique_ptr<OperationPass<ModuleOp>>
-createHALInterfaceToMemrefArgumentsPass();
+/// Pass that tiles and vectorizes linalg::MatmulOp operations.
+std::unique_ptr<FunctionPass> createMatMulTileAndVectorizePass();
 
 /// Pass to perform final conversion to LLVM dialect.
 std::unique_ptr<OperationPass<ModuleOp>> createConvertToLLVMPass();
diff --git a/iree/compiler/Conversion/LinalgToLLVM/test/convert_to_llvm.mlir b/iree/compiler/Conversion/LinalgToLLVM/test/convert_to_llvm.mlir
index c8bf6d8..0a8034f 100644
--- a/iree/compiler/Conversion/LinalgToLLVM/test/convert_to_llvm.mlir
+++ b/iree/compiler/Conversion/LinalgToLLVM/test/convert_to_llvm.mlir
@@ -1,24 +1,36 @@
 // RUN: iree-opt -iree-codegen-convert-to-llvm -split-input-file %s | IreeFileCheck %s
 
-// CHECK-LABEL: func @convert_dynamic_shape
-func @convert_dynamic_shape(%arg0: memref<?x?xf32>, %arg1: memref<2xi32>){
-    %c0 = constant 0 : index
-    %c1 = constant 1 : index
-    %0 = load %arg1[%c0] : memref<2xi32>
-    %1 = index_cast %0 : i32 to index
-    %3 = load %arg1[%c1] : memref<2xi32>
-    %4 = index_cast %3 : i32 to index
-    %5 = shapex.make_ranked_shape %1, %4 : (index, index) -> !shapex.ranked_shape<[?,?]>
-    %6 = shapex.tie_shape %arg0, %5 : memref<?x?xf32>, !shapex.ranked_shape<[?,?]>
-    return
+func @convert_dynamic_shape() -> f32 {
+  %c0 = constant 0 : index
+  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<?x?xf32>
+  %1 = hal.interface.load.constant offset = 0 : index
+  %2 = hal.interface.load.constant offset = 1 : index
+  %3 = shapex.make_ranked_shape %1, %2 : (index, index) -> !shapex.ranked_shape<[?,?]>
+  %6 = shapex.tie_shape %0, %3 : memref<?x?xf32>, !shapex.ranked_shape<[?,?]>
+  %7 = load %6[%c0, %c0] : memref<?x?xf32>
+  return %7 : f32
 }
-// CHECK: %[[DIM0:.+]] = llvm.sext
-// CHECK: %[[DIM1:.+]] = llvm.sext
-// CHECK: llvm.insertvalue %[[DIM0]], %[[MEMREFBASEPTR:.+]][3, 0]
-// CHECK: %[[MEMREFBASEPTR_1:.+]] = llvm.insertvalue %[[DIM1]], %[[MEMREFBASEPTR:.+]][3, 1]
-// CHECK: %[[STRIDE1:.+]] = llvm.mlir.constant(1 : index) : !llvm.i64
-// CHECK: %[[MEMREFBASEPTR_2:.+]] = llvm.insertvalue %[[STRIDE1]], %[[MEMREFBASEPTR_1]][4, 1]
-// CHECK: %[[ESTRIDE1:.+]] = llvm.extractvalue %[[MEMREFBASEPTR_2]][4, 1] 
-// CHECK: %[[EDIM1:.+]] = llvm.extractvalue %[[MEMREFBASEPTR_2]][3, 1] 
-// CHECK: %[[STRIDE0:.+]] = llvm.mul %[[ESTRIDE1]], %[[EDIM1]] : !llvm.i64
-// CHECK: llvm.insertvalue %[[STRIDE0]], %[[MEMREFBASEPTR_2]][4, 0]
\ No newline at end of file
+hal.interface @legacy_io attributes {push_constants = 2 : i32, sym_visibility = "private"} {
+    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
+}
+// CHECK: llvm.func @convert_dynamic_shape(%[[ARG0:.+]]: !llvm<"i8**">, %[[ARG1:.+]]: !llvm<"i64*">)
+// CHECK: %[[PACKED_ARGS_PTR:.+]] = llvm.bitcast %[[ARG0]] : !llvm<"i8**"> to !llvm<"{ float* }*">
+// CHECK: %[[PACKED_ARGS:.+]] = llvm.load %[[PACKED_ARGS_PTR]] : !llvm<"{ float* }*">
+// CHECK: %[[MEMREF0_DATA_PTR:.+]] = llvm.extractvalue %[[PACKED_ARGS]][0] : !llvm<"{ float* }">
+// CHECK: %[[MEMREF0:.+]] = llvm.mlir.undef : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }">
+// CHECK: %[[MEMREF0_0:.+]] = llvm.insertvalue %[[MEMREF0_DATA_PTR]], %[[MEMREF0]][0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }">
+// CHECK: %[[MEMREF0_1:.+]] = llvm.insertvalue %[[MEMREF0_DATA_PTR]], %[[MEMREF0_0]][1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }">
+// CHECK: %[[CONST0:.+]] = llvm.mlir.constant(0 : i64) : !llvm.i64
+// CHECK: %[[DIM0_PTR:.+]] = llvm.getelementptr %[[ARG1]][%[[CONST0]]] : (!llvm<"i64*">, !llvm.i64) -> !llvm<"i64*">
+// CHECK: %[[DIM0:.+]] = llvm.load %[[DIM0_PTR]] : !llvm<"i64*">
+// CHECK: %[[CONST1:.+]] = llvm.mlir.constant(1 : i64) : !llvm.i64
+// CHECK: %[[DIM1_PTR:.+]] = llvm.getelementptr %[[ARG1]][%[[CONST1]]] : (!llvm<"i64*">, !llvm.i64) -> !llvm<"i64*">
+// CHECK: %[[DIM1:.+]] = llvm.load %[[DIM1_PTR]] : !llvm<"i64*">
+// CHECK: %[[MEMREF0_2:.+]] = llvm.insertvalue %[[DIM0]], %[[MEMREF0_1]][3, 0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }">
+// CHECK: %[[MEMREF0_3:.+]] = llvm.insertvalue %[[DIM1]], %[[MEMREF0_2]][3, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }">
+// CHECK: %[[CONST1_STRIDE:.+]] = llvm.mlir.constant(1 : index) : !llvm.i64
+// CHECK: %[[MEMREF0_4:.+]] = llvm.insertvalue %[[CONST1_STRIDE]], %[[MEMREF0_3]][4, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }">
+// CHECK: %[[STRIDE_DIM1:.+]] = llvm.extractvalue %[[MEMREF0_4]][4, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }">
+// CHECK: %[[DIM1_0:.+]] = llvm.extractvalue %[[MEMREF0_4]][3, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }">
+// CHECK: %[[STRIDE_DIM0:.+]] = llvm.mul %[[STRIDE_DIM1]], %[[DIM1_0]] : !llvm.i64
+// CHECK: llvm.insertvalue %[[STRIDE_DIM0]], %[[MEMREF0_4]][4, 0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }">
diff --git a/iree/compiler/Conversion/LinalgToLLVM/test/matmul_vectorization.mlir b/iree/compiler/Conversion/LinalgToLLVM/test/matmul_vectorization.mlir
new file mode 100644
index 0000000..7d47f47
--- /dev/null
+++ b/iree/compiler/Conversion/LinalgToLLVM/test/matmul_vectorization.mlir
@@ -0,0 +1,31 @@
+// RUN: iree-opt --iree-codegen-linalg-to-llvm-matmul-vectorization-pass -split-input-file %s | IreeFileCheck %s
+
+// CHECK-LABEL: func @matmul_128x128x128
+// CHECK-SAME: (%[[ARG0:.+]]: memref<128x128xf32>, %[[ARG1:.+]]: memref<128x128xf32>, %[[ARG2:.+]]: memref<128x128xf32>)
+func @matmul_128x128x128(%arg0 : memref<128x128xf32>, %arg1: memref<128x128xf32>, %arg2: memref<128x128xf32>) {
+    linalg.matmul %arg0, %arg1, %arg2 : (memref<128x128xf32>, memref<128x128xf32>, memref<128x128xf32>)
+    return
+}
+// CHECK: %[[L3END:.+]] = constant 128 : index
+// CHECK: %[[L3STEP:.+]] = constant 64 : index
+// CHECK: %[[L1STEP:.+]] = constant 4 : index
+// CHECK: %[[L2STEP:.+]] = constant 32 : index
+// CHECK: %[[START:.+]] = constant 0 : index
+// CHECK: scf.for %[[IL3:.+]] = %[[START]] to %[[L3END]] step %[[L3STEP]]
+// CHECK: scf.for %[[JL3:.+]] = %[[START]] to %[[L3END]] step %[[L3STEP]]
+// CHECK: scf.for %[[KL3:.+]] = %[[START]] to %[[L3END]] step %[[L3STEP]]
+// CHECK: %[[ARG0_TILE_L3:.+]] = subview %[[ARG0]][%[[IL3]], %[[KL3]]] [64, 64] [1, 1] : memref<128x128xf32> to memref<64x64xf32
+// CHECK: %[[ARG1_TILE_L3:.+]] = subview %[[ARG1]][%[[KL3]], %[[JL3]]] [64, 64] [1, 1] : memref<128x128xf32> to memref<64x64xf32
+// CHECK: %[[ARG2_TILE_L3:.+]] = subview %[[ARG2]][%[[IL3]], %[[JL3]]] [64, 64] [1, 1] : memref<128x128xf32> to memref<64x64xf32
+// CHECK: scf.for %[[IL2:.+]] = %[[START]] to %[[L3STEP]] step %[[L2STEP]]
+// CHECK: scf.for %[[JL2:.+]] = %[[START]] to %[[L3STEP]] step %[[L2STEP]]
+// CHECK: scf.for %[[KL2:.+]] = %[[START]] to %[[L3STEP]] step %[[L2STEP]]
+// CHECK: %[[ARG0_TILE_L2:.+]] = subview %[[ARG0_TILE_L3]][%[[IL2]], %[[KL2]]] [32, 32] [1, 1] : memref<64x64xf32
+// CHECK: %[[ARG1_TILE_L2:.+]] = subview %[[ARG1_TILE_L3]][%[[KL2]], %[[JL2]]] [32, 32] [1, 1] : memref<64x64xf32
+// CHECK: %[[ARG2_TILE_L2:.+]] = subview %[[ARG2_TILE_L3]][%[[IL2]], %[[JL2]]] [32, 32] [1, 1] : memref<64x64xf32
+// CHECK: scf.for %[[IL1:.+]] = %[[START]] to %[[L2STEP]] step %[[L1STEP]]
+// CHECK: scf.for %[[JL1:.+]] = %[[START]] to %[[L2STEP]] step %[[L1STEP]]
+// CHECK: scf.for %[[KL1:.+]] = %[[START]] to %[[L2STEP]] step %[[L1STEP]]
+// CHECK: %[[ARG0_TILE_L1:.+]] = subview %[[ARG0_TILE_L2]][%[[IL1]], %[[KL1]]] [4, 4] [1, 1] : memref<32x32xf32
+// CHECK: %[[ARG1_TILE_L1:.+]] = subview %[[ARG1_TILE_L2]][%[[KL1]], %[[JL1]]] [4, 4] [1, 1] : memref<32x32xf32
+// CHECK: %[[ARG2_TILE_L1:.+]] = subview %[[ARG2_TILE_L2]][%[[IL1]], %[[JL1]]] [4, 4] [1, 1] : memref<32x32xf32
diff --git a/iree/compiler/Conversion/LinalgToSPIRV/ConvertToGPUPass.cpp b/iree/compiler/Conversion/LinalgToSPIRV/ConvertToGPUPass.cpp
index 98e91fa..81514ec 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/ConvertToGPUPass.cpp
+++ b/iree/compiler/Conversion/LinalgToSPIRV/ConvertToGPUPass.cpp
@@ -563,7 +563,7 @@
       ConversionPatternRewriter &rewriter) const override {
     // Check for marker that specifies that the linalg op is to be partitioned
     // across threads within a workgroup.
-    if (!hasWorkItemMarker(linalgOp)) return failure();
+    if (!hasWorkGroupMarker(linalgOp)) return failure();
     Optional<linalg::LinalgLoops> loops =
         linalg::linalgLowerOpToLoops<scf::ParallelOp>(rewriter, linalgOp);
     if (!loops) return failure();
@@ -587,7 +587,7 @@
   LogicalResult matchAndRewrite(
       LinalgOpTy linalgOp, ArrayRef<Value> operands,
       ConversionPatternRewriter &rewriter) const override {
-    if (!hasWorkItemMarker(linalgOp)) return failure();
+    if (!hasWorkGroupMarker(linalgOp)) return failure();
     Optional<linalg::LinalgLoops> loops =
         linalg::linalgLowerOpToLoops<scf::ParallelOp>(rewriter, linalgOp);
     if (!loops) return failure();
diff --git a/iree/compiler/Conversion/LinalgToSPIRV/LinalgTileAndFusePass.cpp b/iree/compiler/Conversion/LinalgToSPIRV/LinalgTileAndFusePass.cpp
index e9dddd6..934e5ae 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/LinalgTileAndFusePass.cpp
+++ b/iree/compiler/Conversion/LinalgToSPIRV/LinalgTileAndFusePass.cpp
@@ -314,7 +314,7 @@
 
   LogicalResult matchAndRewrite(Operation *op,
                                 PatternRewriter &rewriter) const override {
-    if (!hasWorkItemMarker(op)) return failure();
+    if (!hasWorkGroupMarker(op)) return failure();
     return linalg::LinalgPromotionPattern<linalg::MatmulOp>::matchAndRewrite(
         op, rewriter);
   }
@@ -365,7 +365,7 @@
           .setLoopType(linalg::LinalgTilingLoopType::ParallelLoops),
       tileSizeCalculator.getWorkGroupSize(),
       linalg::LinalgMarker(ArrayRef<Identifier>(),
-                           Identifier::get(getWorkItemMarker(), context)));
+                           Identifier::get(getWorkGroupMarker(), context)));
   applyPatternsAndFoldGreedily(getOperation(), tilingPatterns);
 
   if (useWorkgroupMemory) {
@@ -385,7 +385,7 @@
                 [&](OpBuilder &b, Value src, Value dst) -> LogicalResult {
                   return copyToFromWorkgroupMemory(b, src, dst);
                 }),
-        linalg::LinalgMarker(Identifier::get(getWorkItemMarker(), context),
+        linalg::LinalgMarker(Identifier::get(getWorkGroupMarker(), context),
                              Identifier::get(PromotionMarker, context)));
     applyPatternsAndFoldGreedily(getOperation(), promotionPatterns);
   }
@@ -394,7 +394,7 @@
   OpBuilder builder(context);
   funcOp.walk([&builder](linalg::LinalgOp linalgOp) {
     if (hasMarker(linalgOp, PromotionMarker)) {
-      setWorkItemMarker(linalgOp);
+      setWorkGroupMarker(linalgOp);
       insertBarrierAfter(builder, linalgOp.getLoc(), linalgOp);
     }
   });
diff --git a/iree/compiler/Conversion/LinalgToSPIRV/MarkerUtils.cpp b/iree/compiler/Conversion/LinalgToSPIRV/MarkerUtils.cpp
index c874234..47747de 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/MarkerUtils.cpp
+++ b/iree/compiler/Conversion/LinalgToSPIRV/MarkerUtils.cpp
@@ -36,8 +36,6 @@
 
 StringRef getWorkGroupMarker() { return "workgroup"; }
 
-StringRef getWorkItemMarker() { return "workitem"; }
-
 bool hasMarker(Operation *op, StringRef marker) {
   return checkMarkerValue(op, marker);
 }
@@ -46,10 +44,6 @@
   return checkMarkerValue(op, getWorkGroupMarker());
 }
 
-bool hasWorkItemMarker(Operation *op) {
-  return checkMarkerValue(op, getWorkItemMarker());
-}
-
 void setMarker(Operation *op, StringRef marker) {
   op->setAttr(linalg::LinalgTransforms::kLinalgTransformMarker,
               StringAttr::get(marker, op->getContext()));
@@ -57,6 +51,5 @@
 
 void setWorkGroupMarker(Operation *op) { setMarker(op, getWorkGroupMarker()); }
 
-void setWorkItemMarker(Operation *op) { setMarker(op, getWorkItemMarker()); }
 }  // namespace iree_compiler
 }  // namespace mlir
diff --git a/iree/compiler/Conversion/LinalgToSPIRV/MarkerUtils.h b/iree/compiler/Conversion/LinalgToSPIRV/MarkerUtils.h
index 36dccca..e512ead 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/MarkerUtils.h
+++ b/iree/compiler/Conversion/LinalgToSPIRV/MarkerUtils.h
@@ -31,7 +31,7 @@
 namespace iree_compiler {
 
 /// Marker to denote that a linalg operation is to be partitioned to workitems.
-StringRef getWorkItemMarker();
+StringRef getWorkGroupMarker();
 
 /// Returns true if an operation has the specified `marker`. When `marker` is
 /// empty, returns true if the operation has any marker.
@@ -39,14 +39,14 @@
 
 /// Returns true if an operation has marker to denote that it is to be
 /// partitioned to workitems.
-bool hasWorkItemMarker(Operation *);
+bool hasWorkGroupMarker(Operation *);
 
 /// Sets a given marker on an operation.
 void setMarker(Operation *, StringRef);
 
 /// Sets marker to denote that a linalg operation is to be partitioned to
 /// workitems.
-void setWorkItemMarker(Operation *);
+void setWorkGroupMarker(Operation *);
 
 }  // namespace iree_compiler
 }  // namespace mlir
diff --git a/iree/compiler/Conversion/LinalgToSPIRV/test/convert_to_gpu.mlir b/iree/compiler/Conversion/LinalgToSPIRV/test/convert_to_gpu.mlir
index 679f523..64621f3 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/test/convert_to_gpu.mlir
+++ b/iree/compiler/Conversion/LinalgToSPIRV/test/convert_to_gpu.mlir
@@ -162,7 +162,7 @@
         %12 = dim %arg2, %c1 : memref<?x?xf32>
         %13 = affine.min #map0(%arg4)[%12]
         %14 = subview %arg2[%arg3, %arg4] [%11, %13] [1, 1]  : memref<?x?xf32> to memref<?x?xf32, #map2>
-        linalg.matmul %5, %9, %14 {__internal_linalg_transform__ = "workitem"} : (memref<?x?xf32, #map2>, memref<?x?xf32, #map2>, memref<?x?xf32, #map2>)
+        linalg.matmul %5, %9, %14 {__internal_linalg_transform__ = "workgroup"} : (memref<?x?xf32, #map2>, memref<?x?xf32, #map2>, memref<?x?xf32, #map2>)
       }
       scf.yield
     }
@@ -235,7 +235,7 @@
       %13 = affine.min #map5(%arg5)[%4]
       %14 = dim %arg2, %c3 : memref<?x?x?x?xf32>
       %15 = subview %arg2[%arg3, %arg4, %arg5, 0] [%11, %12, %13, %14] [1, 1, 1, 1]  : memref<?x?x?x?xf32> to memref<?x?x?x?xf32, #map3>
-      linalg.conv(%arg0, %9, %15) {__internal_linalg_transform__ = "workitem", dilations = [1, 1], strides = [1, 1]} : memref<?x?x?x?xf32>, memref<?x?x?x?xf32, #map3>, memref<?x?x?x?xf32, #map3>
+      linalg.conv(%arg0, %9, %15) {__internal_linalg_transform__ = "workgroup", dilations = [1, 1], strides = [1, 1]} : memref<?x?x?x?xf32>, memref<?x?x?x?xf32, #map3>, memref<?x?x?x?xf32, #map3>
       scf.yield
     }
     return
@@ -364,7 +364,7 @@
       %9 = affine.min #map3(%arg3)[%2]
       %10 = affine.min #map4(%arg4)[%3]
       %11 = subview %arg2[%arg3, %arg4] [%9, %10] [1, 1]  : memref<?x?xf32> to memref<?x?xf32, #map2>
-      linalg.pooling_max(%8, %arg1, %11) {__internal_linalg_transform__ = "workitem", dilations = [1, 1], strides = [1, 1]} : memref<?x?xf32, #map2>, memref<?x?xf32>, memref<?x?xf32, #map2>
+      linalg.pooling_max(%8, %arg1, %11) {__internal_linalg_transform__ = "workgroup", dilations = [1, 1], strides = [1, 1]} : memref<?x?xf32, #map2>, memref<?x?xf32>, memref<?x?xf32, #map2>
       scf.yield
     }
     return
diff --git a/iree/compiler/Conversion/LinalgToSPIRV/test/convert_to_gpu_option.mlir b/iree/compiler/Conversion/LinalgToSPIRV/test/convert_to_gpu_option.mlir
index 1701535..63f8aa5 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/test/convert_to_gpu_option.mlir
+++ b/iree/compiler/Conversion/LinalgToSPIRV/test/convert_to_gpu_option.mlir
@@ -32,7 +32,7 @@
       %13 = affine.min #map5(%arg5)[%4]
       %14 = dim %arg2, %c3 : memref<?x?x?x?xf32>
       %15 = subview %arg2[%arg3, %arg4, %arg5, 0] [%11, %12, %13, %14] [1, 1, 1, 1]  : memref<?x?x?x?xf32> to memref<?x?x?x?xf32, #map3>
-      linalg.conv(%arg0, %9, %15) {__internal_linalg_transform__ = "workitem", dilations = [1, 1], strides = [1, 1]} : memref<?x?x?x?xf32>, memref<?x?x?x?xf32, #map3>, memref<?x?x?x?xf32, #map3>
+      linalg.conv(%arg0, %9, %15) {__internal_linalg_transform__ = "workgroup", dilations = [1, 1], strides = [1, 1]} : memref<?x?x?x?xf32>, memref<?x?x?x?xf32, #map3>, memref<?x?x?x?xf32, #map3>
       scf.yield
     }
     return
diff --git a/iree/compiler/Conversion/LinalgToSPIRV/test/cyclic_to_workgroup.mlir b/iree/compiler/Conversion/LinalgToSPIRV/test/cyclic_to_workgroup.mlir
index 110ac24..cac18ab 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/test/cyclic_to_workgroup.mlir
+++ b/iree/compiler/Conversion/LinalgToSPIRV/test/cyclic_to_workgroup.mlir
@@ -27,7 +27,7 @@
         %12 = dim %arg2, %c1 : memref<?x?xf32>
         %13 = affine.min #map0(%arg4)[%12]
         %14 = subview %arg2[%arg3, %arg4] [%11, %13] [1, 1]  : memref<?x?xf32> to memref<?x?xf32, #map2>
-        linalg.matmul %5, %9, %14 {__internal_linalg_transform__ = "workitem"} : (memref<?x?xf32, #map2>, memref<?x?xf32, #map2>, memref<?x?xf32, #map2>)
+        linalg.matmul %5, %9, %14 {__internal_linalg_transform__ = "workgroup"} : (memref<?x?xf32, #map2>, memref<?x?xf32, #map2>, memref<?x?xf32, #map2>)
       }
       scf.yield
     }
diff --git a/iree/compiler/Conversion/LinalgToSPIRV/test/linalg_tile_and_fuse.mlir b/iree/compiler/Conversion/LinalgToSPIRV/test/linalg_tile_and_fuse.mlir
index 0e2fe6d..1728d35 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/test/linalg_tile_and_fuse.mlir
+++ b/iree/compiler/Conversion/LinalgToSPIRV/test/linalg_tile_and_fuse.mlir
@@ -51,7 +51,7 @@
 //       CHECK:     %[[VIEW2:.+]] = subview %[[ARG2]]
 //       CHECK:     linalg.conv
 //  CHECK-SAME:       %[[ARG0]], %[[VIEW1]], %[[VIEW2]]
-//  CHECK-SAME:       "workitem"
+//  CHECK-SAME:       "workgroup"
 
 // -----
 
@@ -81,7 +81,7 @@
 //       CHECK:     %[[VIEW1:.+]] = subview %[[ARG1]]
 //       CHECK:     %[[VIEW2:.+]] = subview %[[ARG2]]
 //       CHECK:     linalg.matmul
-//  CHECK-SAME:       "workitem"
+//  CHECK-SAME:       "workgroup"
 //  CHECK-SAME:       %[[VIEW0]], %[[VIEW1]], %[[VIEW2]]
 
 // -----
@@ -111,4 +111,4 @@
 //       CHECK:     %[[VIEW2:.+]] = subview %[[ARG2]]
 //       CHECK:     linalg.pooling_max
 //  CHECK-SAME:       %[[VIEW0]], %[[ARG1]], %[[VIEW2]]
-//  CHECK-SAME:       "workitem"
+//  CHECK-SAME:       "workgroup"
diff --git a/iree/compiler/Conversion/LinalgToSPIRV/test/workgroup_memory_promotion.mlir b/iree/compiler/Conversion/LinalgToSPIRV/test/workgroup_memory_promotion.mlir
index 76cfcb8..a24c77b 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/test/workgroup_memory_promotion.mlir
+++ b/iree/compiler/Conversion/LinalgToSPIRV/test/workgroup_memory_promotion.mlir
@@ -36,12 +36,12 @@
 //      CHECK:     %[[ALLOC2:.+]] = alloc(%[[C4]], %[[C8]]) : memref<?x?xf32, 3>
 //      CHECK:     %[[SUBVIEW2:.+]] = subview %[[ALLOC2]]
 //      CHECK:     linalg.copy(%[[ARG0SV]], %[[SUBVIEW1]])
-// CHECK-SAME:       "workitem"
+// CHECK-SAME:       "workgroup"
 //      CHECK:     spv.ControlBarrier "Workgroup", "Workgroup", "AcquireRelease"
 //      CHECK:     linalg.copy(%[[ARG1SV]], %[[SUBVIEW2]])
-// CHECK-SAME:       "workitem"
+// CHECK-SAME:       "workgroup"
 //      CHECK:     spv.ControlBarrier "Workgroup", "Workgroup", "AcquireRelease"
-//      CHECK:     linalg.matmul {{.*}}"workitem"{{.*}} %[[SUBVIEW1]], %[[SUBVIEW2]], %[[RET0SV]]
+//      CHECK:     linalg.matmul {{.*}}"workgroup"{{.*}} %[[SUBVIEW1]], %[[SUBVIEW2]], %[[RET0SV]]
 //      CHECK:     spv.ControlBarrier "Workgroup", "Workgroup", "AcquireRelease"
 //  CHECK-DAG:     dealloc %[[ALLOC1]] : memref<?x?xf32, 3>
 //  CHECK-DAG:     dealloc %[[ALLOC2]] : memref<?x?xf32, 3>
diff --git a/iree/compiler/Conversion/init_conversions.h b/iree/compiler/Conversion/init_conversions.h
index 7a190e7..259e3d5 100644
--- a/iree/compiler/Conversion/init_conversions.h
+++ b/iree/compiler/Conversion/init_conversions.h
@@ -47,7 +47,6 @@
 inline void registerLinalgToLLVMPasses() {
   static bool init_once = []() {
     // LinalgToLLVM
-    createHALInterfaceToMemrefArgumentsPass();
     return true;
   }();
   (void)init_once;
diff --git a/iree/compiler/Dialect/Flow/IR/FlowOps.td b/iree/compiler/Dialect/Flow/IR/FlowOps.td
index 23bce9c..13ad2f1 100644
--- a/iree/compiler/Dialect/Flow/IR/FlowOps.td
+++ b/iree/compiler/Dialect/Flow/IR/FlowOps.td
@@ -610,6 +610,17 @@
   let hasFolder = 1;
 }
 
+def FLOW_TensorTraceOp : FLOW_Op<"tensor.trace", []> {
+  let summary = [{trace value(s) operation}];
+  let description = [{
+    Trace point for dispatchable functions.
+  }];
+
+  let arguments = (ins Variadic<FLOW_Tensor>:$operands);
+
+  let assemblyFormat = "attr-dict ($operands^ `:` type($operands))?";
+}
+
 //===----------------------------------------------------------------------===//
 // Streams
 //===----------------------------------------------------------------------===//
diff --git a/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp b/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp
index 44a19d1..3281c6c 100644
--- a/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp
+++ b/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp
@@ -25,6 +25,7 @@
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Diagnostics.h"
+#include "mlir/IR/StandardTypes.h"
 #include "mlir/IR/SymbolTable.h"
 #include "mlir/Pass/Pass.h"
 
@@ -40,6 +41,11 @@
 
 namespace {
 
+static llvm::cl::opt<bool> traceDispatchTensors(
+    "iree-flow-trace-dispatch-tensors",
+    llvm::cl::desc("Trace input/output values for each dispatch function"),
+    llvm::cl::init(false));
+
 // Converts a dispatch_region into a dispatch to the outlined region function.
 LogicalResult convertToDispatchOp(DispatchRegionOp regionOp,
                                   ExecutableOp executableOp,
@@ -57,11 +63,28 @@
     return failure();
   }
 
+  auto getTensorTypeArgs = [](auto args) {
+    SmallVector<Value, 4> res;
+    for (auto arg : args) {
+      if (arg.getType().template isa<TensorType>()) res.push_back(arg);
+    }
+    return res;
+  };
+  if (traceDispatchTensors) {
+    builder.create<TensorTraceOp>(regionOp.getLoc(),
+                                  getTensorTypeArgs(newArgs));
+  }
+
   // Create the dispatch op to the executable function.
   auto dispatchOp = builder.create<DispatchOp>(
       regionOp.getLoc(), executableOp.getName(), entryPointOp.getName(),
       regionOp.workload(), outlinedFuncOp.getType().getResults(), newArgs);
 
+  if (traceDispatchTensors) {
+    builder.create<TensorTraceOp>(regionOp.getLoc(),
+                                  getTensorTypeArgs(dispatchOp.getResults()));
+  }
+
   // Replace uses of the existing results with the new results.
   for (int i = 0; i < regionOp.getNumResults(); ++i) {
     regionOp.getResult(i).replaceAllUsesWith(dispatchOp.getResult(i));
diff --git a/iree/compiler/Dialect/HAL/Conversion/FlowToHAL/ConvertTensorOps.cpp b/iree/compiler/Dialect/HAL/Conversion/FlowToHAL/ConvertTensorOps.cpp
index 2947b70..6fcadbd 100644
--- a/iree/compiler/Dialect/HAL/Conversion/FlowToHAL/ConvertTensorOps.cpp
+++ b/iree/compiler/Dialect/HAL/Conversion/FlowToHAL/ConvertTensorOps.cpp
@@ -19,6 +19,7 @@
 #include "iree/compiler/Dialect/HAL/Utils/TypeUtils.h"
 #include "iree/compiler/Dialect/IREE/IR/IREETypes.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/BlockAndValueMapping.h"
@@ -120,13 +121,36 @@
   }
 };
 
+class TensorTraceOpConversion
+    : public OpConversionPattern<IREE::Flow::TensorTraceOp> {
+ public:
+  TensorTraceOpConversion(MLIRContext *ctx, TypeConverter &converter)
+      : OpConversionPattern(ctx) {}
+
+  LogicalResult matchAndRewrite(
+      IREE::Flow::TensorTraceOp traceOp, llvm::ArrayRef<Value> rawOperands,
+      ConversionPatternRewriter &rewriter) const override {
+    Location loc = traceOp.getLoc();
+    SmallVector<Value, 4> bufferViews;
+    for (auto operand : llvm::enumerate(rawOperands)) {
+      auto adaptor = IREE::HAL::TensorRewriteAdaptor::get(
+          loc, traceOp.getOperand(operand.index()), operand.value(), rewriter);
+      bufferViews.emplace_back(adaptor.getBufferView());
+    }
+    rewriter.replaceOpWithNewOp<IREE::HAL::BufferViewTraceOp>(traceOp,
+                                                              bufferViews);
+    return success();
+  }
+};
+
 }  // namespace
 
 void populateFlowTensorToHALPatterns(MLIRContext *context,
                                      OwningRewritePatternList &patterns,
                                      TypeConverter &converter) {
   patterns.insert<ConstantTensorOpConversion, TensorLoadOpConversion,
-                  TensorStoreOpConversion>(context, converter);
+                  TensorStoreOpConversion, TensorTraceOpConversion>(context,
+                                                                    converter);
 }
 
 }  // namespace iree_compiler
diff --git a/iree/compiler/Dialect/HAL/Conversion/HALToVM/ConvertBufferViewOps.cpp b/iree/compiler/Dialect/HAL/Conversion/HALToVM/ConvertBufferViewOps.cpp
index 7212549..5abbfa4 100644
--- a/iree/compiler/Dialect/HAL/Conversion/HALToVM/ConvertBufferViewOps.cpp
+++ b/iree/compiler/Dialect/HAL/Conversion/HALToVM/ConvertBufferViewOps.cpp
@@ -118,6 +118,8 @@
       context, importSymbols, typeConverter, "hal.buffer_view.dim");
   patterns.insert<BufferViewDimsOpConversion>(
       context, importSymbols, typeConverter, "hal.buffer_view.dims");
+  patterns.insert<VMImportOpConversion<IREE::HAL::BufferViewTraceOp>>(
+      context, importSymbols, typeConverter, "hal.buffer_view.trace");
 }
 
 }  // namespace iree_compiler
diff --git a/iree/compiler/Dialect/HAL/IR/HALOps.td b/iree/compiler/Dialect/HAL/IR/HALOps.td
index 0139ddb..9593647 100644
--- a/iree/compiler/Dialect/HAL/IR/HALOps.td
+++ b/iree/compiler/Dialect/HAL/IR/HALOps.td
@@ -927,6 +927,17 @@
   let assemblyFormat = [{$buffer_view attr-dict `:` type($result)}];
 }
 
+def HAL_BufferViewTraceOp : HAL_Op<"buffer_view.trace", []> {
+  let summary = [{trace value(s) operation}];
+  let description = [{
+    Trace point for dispatchable functions.
+  }];
+
+  let arguments = (ins Variadic<HAL_BufferView>:$operands);
+
+  let assemblyFormat = "attr-dict ($operands^ `:` type($operands))?";
+}
+
 //===----------------------------------------------------------------------===//
 // iree::hal::CommandBuffer
 //===----------------------------------------------------------------------===//
diff --git a/iree/compiler/Dialect/HAL/Target/LLVM/BUILD b/iree/compiler/Dialect/HAL/Target/LLVM/BUILD
index 2e322af..0f6be4a 100644
--- a/iree/compiler/Dialect/HAL/Target/LLVM/BUILD
+++ b/iree/compiler/Dialect/HAL/Target/LLVM/BUILD
@@ -105,9 +105,7 @@
 cc_library(
     name = "LLVMAOTTargetLinker",
     hdrs = ["LLVMAOTTargetLinker.h"],
-    deps = [
-        "//iree/base:file_io",
-    ] + platform_trampoline_deps("LLVMAOTTargetLinker", "compiler/Dialect/HAL/Target/LLVM"),
+    deps = platform_trampoline_deps("LLVMAOTTargetLinker", "compiler/Dialect/HAL/Target/LLVM"),
 )
 
 cc_library(
@@ -115,6 +113,6 @@
     hdrs = ["LLVMAOTTargetLinker.h"],
     deps = [
         ":LLVMTargetOptions",
-        "//iree/base:file_io",
+        "//iree/base:status",
     ],
 )
diff --git a/iree/compiler/Dialect/HAL/Target/LLVM/CMakeLists.txt b/iree/compiler/Dialect/HAL/Target/LLVM/CMakeLists.txt
index 4aa0ad3..0ee00e4 100644
--- a/iree/compiler/Dialect/HAL/Target/LLVM/CMakeLists.txt
+++ b/iree/compiler/Dialect/HAL/Target/LLVM/CMakeLists.txt
@@ -97,7 +97,6 @@
   HDRS
     "LLVMAOTTargetLinker.h"
   DEPS
-    iree::base::file_io
     iree::compiler::Dialect::HAL::Target::LLVM::internal::LLVMAOTTargetLinker_internal
   PUBLIC
 )
@@ -109,6 +108,6 @@
     "LLVMAOTTargetLinker.h"
   DEPS
     ::LLVMTargetOptions
-    iree::base::file_io
+    iree::base::status
   PUBLIC
 )
diff --git a/iree/compiler/Dialect/HAL/Target/LLVM/LLVMAOTTarget.cpp b/iree/compiler/Dialect/HAL/Target/LLVM/LLVMAOTTarget.cpp
index 7269089..8cb47b5 100644
--- a/iree/compiler/Dialect/HAL/Target/LLVM/LLVMAOTTarget.cpp
+++ b/iree/compiler/Dialect/HAL/Target/LLVM/LLVMAOTTarget.cpp
@@ -65,13 +65,10 @@
     auto executableOp = cast<ExecutableOp>(targetOp.getParentOp());
     auto entryPointOps =
         executableOp.getBlock().getOps<ExecutableEntryPointOp>();
-    const bool addCInterface = true;
+
     for (auto entryPointOp : entryPointOps) {
-      std::string funcName =
-          addCInterface ? "_mlir_ciface_" + std::string(entryPointOp.sym_name())
-                        : std::string(entryPointOp.sym_name());
-      dyLibExecutableDef.entry_points.push_back("invoke_" + funcName);
-      createLLVMInvocationFunc(funcName, llvmModule.get());
+      dyLibExecutableDef.entry_points.push_back(
+          std::string(entryPointOp.sym_name()));
     }
 
     // LLVMIR opt passes.
diff --git a/iree/compiler/Dialect/HAL/Target/LLVM/LLVMAOTTargetLinker.h b/iree/compiler/Dialect/HAL/Target/LLVM/LLVMAOTTargetLinker.h
index 764ad02..669f17c 100644
--- a/iree/compiler/Dialect/HAL/Target/LLVM/LLVMAOTTargetLinker.h
+++ b/iree/compiler/Dialect/HAL/Target/LLVM/LLVMAOTTargetLinker.h
@@ -18,7 +18,7 @@
 
 #include <string>
 
-#include "iree/base/file_io.h"
+#include "iree/base/status.h"
 #include "iree/compiler/Dialect/HAL/Target/LLVM/LLVMTargetOptions.h"
 
 namespace mlir {
diff --git a/iree/compiler/Dialect/HAL/Target/LLVM/LLVMIRPasses.cpp b/iree/compiler/Dialect/HAL/Target/LLVM/LLVMIRPasses.cpp
index cb2a526..e91441d 100644
--- a/iree/compiler/Dialect/HAL/Target/LLVM/LLVMIRPasses.cpp
+++ b/iree/compiler/Dialect/HAL/Target/LLVM/LLVMIRPasses.cpp
@@ -44,44 +44,6 @@
   return machine;
 }
 
-void createLLVMInvocationFunc(const std::string& name, llvm::Module* module) {
-  // TODO(ataei): This is written as a stub in LLVM IR. It would be easier to
-  // have this using MLIR and lower it to LLVM like the dispatch function
-  // implementation is.
-
-  auto& ctx = module->getContext();
-  llvm::IRBuilder<> builder(ctx);
-  auto var_func = module->getFunction(name);
-
-  auto new_type = llvm::FunctionType::get(
-      builder.getVoidTy(), builder.getInt8PtrTy()->getPointerTo(),
-      /*isVarArg=*/false);
-
-  auto new_name = "invoke_" + name;
-  auto func_cst = module->getOrInsertFunction(new_name, new_type);
-  llvm::Function* interface_func =
-      llvm::cast<llvm::Function>(func_cst.getCallee());
-
-  auto bb = llvm::BasicBlock::Create(ctx);
-  bb->insertInto(interface_func);
-  builder.SetInsertPoint(bb);
-  llvm::Value* argList = interface_func->arg_begin();
-  llvm::SmallVector<llvm::Value*, 8> args;
-  args.reserve(llvm::size(var_func->args()));
-  for (auto& indexedArg : llvm::enumerate(var_func->args())) {
-    llvm::Value* arg_index = llvm::Constant::getIntegerValue(
-        builder.getInt64Ty(), llvm::APInt(64, indexedArg.index()));
-    llvm::Value* arg_ptr_ptr = builder.CreateGEP(argList, arg_index);
-    llvm::Value* arg_ptr = builder.CreateLoad(arg_ptr_ptr);
-    arg_ptr = builder.CreateBitCast(
-        arg_ptr, indexedArg.value().getType()->getPointerTo());
-    llvm::Value* arg = builder.CreateLoad(arg_ptr);
-    args.push_back(arg);
-  }
-  builder.CreateCall(var_func, args);
-  builder.CreateRetVoid();
-}
-
 LogicalResult runLLVMIRPasses(const LLVMTargetOptions& options,
                               llvm::TargetMachine* machine,
                               llvm::Module* module) {
@@ -91,7 +53,8 @@
   llvm::ModuleAnalysisManager moduleAnalysisManager;
 
   llvm::PassInstrumentationCallbacks passInstrumentationCallbacks;
-  llvm::StandardInstrumentations standardInstrumentations;
+  llvm::StandardInstrumentations standardInstrumentations(
+      /*DebugLogging=*/false);
   standardInstrumentations.registerCallbacks(passInstrumentationCallbacks);
 
   llvm::PassBuilder passBuilder(machine, options.pipelineTuningOptions, {},
diff --git a/iree/compiler/Dialect/HAL/Target/LLVM/LLVMIRPasses.h b/iree/compiler/Dialect/HAL/Target/LLVM/LLVMIRPasses.h
index 199e36f..37ee1ba 100644
--- a/iree/compiler/Dialect/HAL/Target/LLVM/LLVMIRPasses.h
+++ b/iree/compiler/Dialect/HAL/Target/LLVM/LLVMIRPasses.h
@@ -31,9 +31,6 @@
 std::unique_ptr<llvm::TargetMachine> createTargetMachine(
     const LLVMTargetOptions& options);
 
-// Creates an invocation function in a module for the given function name.
-void createLLVMInvocationFunc(const std::string& name, llvm::Module* module);
-
 // Creates and runs LLVMIR optimization passes defined in LLVMTargetOptions.
 LogicalResult runLLVMIRPasses(const LLVMTargetOptions& options,
                               llvm::TargetMachine* machine,
diff --git a/iree/compiler/Dialect/HAL/Target/LLVM/LLVMIRTarget.cpp b/iree/compiler/Dialect/HAL/Target/LLVM/LLVMIRTarget.cpp
index 98c0bf4..96bb5ac 100644
--- a/iree/compiler/Dialect/HAL/Target/LLVM/LLVMIRTarget.cpp
+++ b/iree/compiler/Dialect/HAL/Target/LLVM/LLVMIRTarget.cpp
@@ -58,13 +58,9 @@
     auto executableOp = cast<IREE::HAL::ExecutableOp>(targetOp.getParentOp());
     auto entryPointOps =
         executableOp.getBlock().getOps<IREE::HAL::ExecutableEntryPointOp>();
-    const bool addCInterface = true;
     for (auto entryPointOp : entryPointOps) {
-      std::string funcName =
-          addCInterface ? "_mlir_ciface_" + std::string(entryPointOp.sym_name())
-                        : std::string(entryPointOp.sym_name());
-      llvmIrExecutableDef.entry_points.push_back(funcName);
-      createLLVMInvocationFunc(funcName, llvmModule.get());
+      llvmIrExecutableDef.entry_points.push_back(
+          std::string(entryPointOp.sym_name()));
     }
 
     // LLVMIR opt passes.
@@ -74,8 +70,9 @@
                          options_.targetTriple);
       return failure();
     }
-    if (failed(
-            runLLVMIRPasses(options_, targetMachine.get(), llvmModule.get()))) {
+    LogicalResult translationResult =
+        runLLVMIRPasses(options_, targetMachine.get(), llvmModule.get());
+    if (failed(translationResult)) {
       return targetOp.emitError(
           "Can't build LLVMIR opt passes for ExecutableOp module");
     }
diff --git a/iree/compiler/Dialect/HAL/Target/LLVM/internal/BUILD b/iree/compiler/Dialect/HAL/Target/LLVM/internal/BUILD
index c3ba845..19c3372 100644
--- a/iree/compiler/Dialect/HAL/Target/LLVM/internal/BUILD
+++ b/iree/compiler/Dialect/HAL/Target/LLVM/internal/BUILD
@@ -24,5 +24,6 @@
     deps = [
         "//iree/base:status",
         "//iree/compiler/Dialect/HAL/Target/LLVM:LLVMAOTTargetLinker_hdrs",
+        "@llvm-project//llvm:Support",
     ],
 )
diff --git a/iree/compiler/Dialect/HAL/Target/LLVM/internal/CMakeLists.txt b/iree/compiler/Dialect/HAL/Target/LLVM/internal/CMakeLists.txt
index 18c9c7a..b91dae5 100644
--- a/iree/compiler/Dialect/HAL/Target/LLVM/internal/CMakeLists.txt
+++ b/iree/compiler/Dialect/HAL/Target/LLVM/internal/CMakeLists.txt
@@ -20,6 +20,7 @@
   SRCS
     "LLVMAOTTargetLinker.cpp"
   DEPS
+    LLVMSupport
     iree::base::status
     iree::compiler::Dialect::HAL::Target::LLVM::LLVMAOTTargetLinker_hdrs
   PUBLIC
diff --git a/iree/compiler/Dialect/HAL/Target/LLVM/internal/LLVMAOTTargetLinker.cpp b/iree/compiler/Dialect/HAL/Target/LLVM/internal/LLVMAOTTargetLinker.cpp
index e5108d6..65d8e22 100644
--- a/iree/compiler/Dialect/HAL/Target/LLVM/internal/LLVMAOTTargetLinker.cpp
+++ b/iree/compiler/Dialect/HAL/Target/LLVM/internal/LLVMAOTTargetLinker.cpp
@@ -15,6 +15,7 @@
 #include "iree/compiler/Dialect/HAL/Target/LLVM/LLVMAOTTargetLinker.h"
 
 #include "iree/base/status.h"
+#include "llvm/Support/ToolOutputFile.h"
 
 namespace mlir {
 namespace iree_compiler {
@@ -23,18 +24,47 @@
 
 iree::StatusOr<std::string> linkLLVMAOTObjects(
     const std::string& linkerToolPath, const std::string& objData) {
-  std::string archiveFile, sharedLibFile;
-  ASSIGN_OR_RETURN(archiveFile, iree::file_io::GetTempFile("objfile"));
-  RETURN_IF_ERROR(iree::file_io::SetFileContents(archiveFile, objData));
-  ASSIGN_OR_RETURN(sharedLibFile, iree::file_io::GetTempFile("dylibfile"));
-  std::string linkingCmd =
-      linkerToolPath + " -shared " + archiveFile + " -o " + sharedLibFile;
+  llvm::SmallString<32> objFilePath, dylibFilePath;
+  if (std::error_code error = llvm::sys::fs::createTemporaryFile(
+          "llvmaot_dylibs", "objfile", objFilePath)) {
+    return iree::InternalErrorBuilder(IREE_LOC)
+           << "Failed to generate temporary file for objfile : '"
+           << error.message() << "'";
+  }
+  if (std::error_code error = llvm::sys::fs::createTemporaryFile(
+          "llvmaot_dylibs", "dylibfile", dylibFilePath)) {
+    return iree::InternalErrorBuilder(IREE_LOC)
+           << "Failed to generate temporary file for dylib : '"
+           << error.message() << "'";
+  }
+  std::error_code error;
+  auto outputFile = std::make_unique<llvm::ToolOutputFile>(
+      objFilePath, error, llvm::sys::fs::F_None);
+  if (error) {
+    return iree::InternalErrorBuilder(IREE_LOC)
+           << "Failed to open temporary objfile '" << objFilePath.c_str()
+           << "' for dylib : '" << error.message() << "'";
+  }
+
+  outputFile->os() << objData;
+  outputFile->os().flush();
+
+  auto linkingCmd =
+      (linkerToolPath + " -shared " + objFilePath + " -o " + dylibFilePath)
+          .str();
   int systemRet = system(linkingCmd.c_str());
   if (systemRet != 0) {
     return iree::InternalErrorBuilder(IREE_LOC)
            << linkingCmd << " failed with exit code " << systemRet;
   }
-  return iree::file_io::GetFileContents(sharedLibFile);
+
+  auto dylibData = llvm::MemoryBuffer::getFile(dylibFilePath);
+  if (!dylibData) {
+    return iree::InternalErrorBuilder(IREE_LOC)
+           << "Failed to read temporary dylib file '" << dylibFilePath.c_str()
+           << "'";
+  }
+  return dylibData.get()->getBuffer().str();
 }
 
 iree::StatusOr<std::string> linkLLVMAOTObjectsWithLLDElf(
diff --git a/iree/compiler/Dialect/HAL/hal.imports.mlir b/iree/compiler/Dialect/HAL/hal.imports.mlir
index 224bb7b..f3d8c62 100644
--- a/iree/compiler/Dialect/HAL/hal.imports.mlir
+++ b/iree/compiler/Dialect/HAL/hal.imports.mlir
@@ -214,6 +214,11 @@
 ) -> (i32, i32, i32, i32)
 attributes {nosideeffects}
 
+// Prints out the content of buffers.
+vm.import @buffer_view.trace(
+  %operands : !vm.ref<!hal.buffer_view> ...
+)
+
 //===----------------------------------------------------------------------===//
 // iree::hal::CommandBuffer
 //===----------------------------------------------------------------------===//
diff --git a/iree/compiler/Dialect/IREE/Transforms/test/drop_compiler_hints.mlir b/iree/compiler/Dialect/IREE/Transforms/test/drop_compiler_hints.mlir
index 9486f96..39c18ce 100644
--- a/iree/compiler/Dialect/IREE/Transforms/test/drop_compiler_hints.mlir
+++ b/iree/compiler/Dialect/IREE/Transforms/test/drop_compiler_hints.mlir
@@ -1,6 +1,6 @@
 // RUN: iree-opt -split-input-file -iree-drop-compiler-hints %s | IreeFileCheck --implicit-check-not="iree.do_not_optimize" %s
 
-// This file is used as an example in docs/developer_overview.md.
+// This file is used as an example in docs/developing_iree/developer_overview.md.
 // If you move or delete it, please update the documentation accordingly.
 
 // CHECK-LABEL: @constant
diff --git a/iree/compiler/Dialect/Shape/Conversion/ConvertShapeToShapex.cpp b/iree/compiler/Dialect/Shape/Conversion/ConvertShapeToShapex.cpp
index 1f2a0e6..decf426 100644
--- a/iree/compiler/Dialect/Shape/Conversion/ConvertShapeToShapex.cpp
+++ b/iree/compiler/Dialect/Shape/Conversion/ConvertShapeToShapex.cpp
@@ -100,8 +100,28 @@
     }
     auto resultType =
         RankedShapeType::get(tensorType.getShape(), rewriter.getContext());
-    rewriter.replaceOpWithNewOp<Shape::GetRankedShapeOp>(op, resultType,
-                                                         operands[0]);
+    // TODO(jpienaar): The following needs to be re-evaluated once the patch
+    // train from 2020/07/23 integrates properly. This is required to make
+    // it forward and backwards compatible. Also, tests need to be added once
+    // upstream integrates (and this can be tested).
+    // rewriter.replaceOpWithNewOp<Shape::GetRankedShapeOp>(op, resultType,
+    //                                                      operands[0]);
+    auto getRanked = rewriter.create<Shape::GetRankedShapeOp>(
+        op.getLoc(), resultType, operands[0]);
+
+    // For FromExtentTensorOp users, just forward the result from GetRanked.
+    SmallPtrSet<Operation *, 2> toDelete;
+    for (auto use : op.getOperation()->getUsers()) {
+      if (isa<FromExtentTensorOp>(use)) {
+        use->replaceAllUsesWith(getRanked);
+        toDelete.insert(use);
+      }
+    }
+    for (Operation *use : toDelete) {
+      rewriter.eraseOp(use);
+    }
+
+    rewriter.replaceOp(op.getOperation(), getRanked.getResult());
     return success();
   }
 };
diff --git a/iree/compiler/Dialect/Shape/Conversion/test/shape_to_shapex.mlir b/iree/compiler/Dialect/Shape/Conversion/test/shape_to_shapex.mlir
index 5017c18..ea613c9 100644
--- a/iree/compiler/Dialect/Shape/Conversion/test/shape_to_shapex.mlir
+++ b/iree/compiler/Dialect/Shape/Conversion/test/shape_to_shapex.mlir
@@ -5,8 +5,8 @@
 // CHECK-LABEL: func @f
 func @f(%arg0: tensor<?xf32>) {
   // CHECK: shapex.const_ranked_shape : !shapex.ranked_shape<[1,2,3]>
-  %0 = shape.const_shape [1, 2, 3]
-  "foo.use"(%0) : (!shape.shape) -> ()
+  %0 = shape.const_shape [1, 2, 3] : tensor<?xindex>
+  "foo.use"(%0) : (tensor<?xindex>) -> ()
   return
 }
 
diff --git a/iree/compiler/Dialect/Shape/IR/ShapeInterface.h b/iree/compiler/Dialect/Shape/IR/ShapeInterface.h
index a200bfd..b36890e 100644
--- a/iree/compiler/Dialect/Shape/IR/ShapeInterface.h
+++ b/iree/compiler/Dialect/Shape/IR/ShapeInterface.h
@@ -58,7 +58,10 @@
   }
 
   template <typename BuilderTy, typename... ConstructorArgs>
-  BuilderTy &make(ConstructorArgs &&... args) {
+  // TODO(suderman): Re-enable clang-format when new version migrates.
+  // clang-format off
+  BuilderTy &make(ConstructorArgs &&...args) {
+    // clang-format on
     auto instance =
         std::make_unique<BuilderTy>(std::forward<ConstructorArgs>(args)...);
     BuilderTy *unowned = instance.get();
diff --git a/iree/compiler/Dialect/Shape/IR/ShapeOps.cpp b/iree/compiler/Dialect/Shape/IR/ShapeOps.cpp
index b81c600..de5a06f 100644
--- a/iree/compiler/Dialect/Shape/IR/ShapeOps.cpp
+++ b/iree/compiler/Dialect/Shape/IR/ShapeOps.cpp
@@ -321,7 +321,9 @@
     SmallVectorImpl<Type> &inferredReturnTypes) {
   auto inputType = operands[0].getType().dyn_cast<RankedTensorType>();
   if (!inputType || !isValidTensorOfExtents(inputType)) {
-    return failure();
+    return emitOptionalError(location, "Invalid input type, ",
+                             operands[0].getType(),
+                             ", for from_extent_tensor op");
   }
   SmallVector<int64_t, 6> extents(inputType.getDimSize(0),
                                   static_cast<int64_t>(-1));
diff --git a/iree/compiler/Dialect/Shape/Plugins/XLA/XlaHloShapeBuilder.cpp b/iree/compiler/Dialect/Shape/Plugins/XLA/XlaHloShapeBuilder.cpp
index 8d37751..9024e88 100644
--- a/iree/compiler/Dialect/Shape/Plugins/XLA/XlaHloShapeBuilder.cpp
+++ b/iree/compiler/Dialect/Shape/Plugins/XLA/XlaHloShapeBuilder.cpp
@@ -308,6 +308,130 @@
   return builder.create<MakeRankedShapeOp>(loc, resultShape, dynamicDims);
 }
 
+Value rewriteTorchIndexSelect(RankedShapeType resultShape,
+                              TorchIndexSelectOp torchIndexSelectOp,
+                              OpBuilder &builder) {
+  if (!torchIndexSelectOp) return nullptr;
+  auto loc = torchIndexSelectOp.getLoc();
+
+  int64_t resultShapeRank = resultShape.getRank();
+  auto paramsType =
+      torchIndexSelectOp.input().getType().dyn_cast<RankedTensorType>();
+  auto indicesType =
+      torchIndexSelectOp.index().getType().dyn_cast<RankedTensorType>();
+  if (!paramsType || !indicesType) {
+    return nullptr;
+  }
+
+  auto axis = torchIndexSelectOp.dim();
+  auto batchDim = torchIndexSelectOp.batch_dims();
+  int64_t paramsRank = paramsType.getRank();
+  int64_t indicesRank = indicesType.getRank();
+
+  std::vector<int64_t> shape(paramsType.getShape());
+  int64_t axisValue = axis.getSExtValue();
+  int64_t batchDimValue = batchDim.getSExtValue();
+
+  // For neg axis values, we wrap around params,
+  // e.g. axis = -1 => params[:-1]
+  if (axisValue < 0) {
+    axisValue += paramsRank;
+  }
+  if (batchDimValue < 0) {
+    batchDimValue += indicesRank;
+  }
+
+  // params must be at least rank axis + 1
+  if (paramsRank < axisValue + 1) {
+    return nullptr;
+  }
+
+  auto paramsShapeValue = builder.create<GetRankedShapeOp>(
+      loc, RankedShapeType::get(paramsType.getShape(), builder.getContext()),
+      torchIndexSelectOp.input());
+  auto indicesShapeValue = builder.create<GetRankedShapeOp>(
+      loc, RankedShapeType::get(indicesType.getShape(), builder.getContext()),
+      torchIndexSelectOp.index());
+
+  SmallVector<Value, 4> dynamicDims;
+#define GENERATE_RANKED_DIM_OP(value, index)                                   \
+  do {                                                                         \
+    auto dimValue = builder.create<RankedDimOp>(                               \
+        loc, builder.getIndexType(), value, builder.getI64IntegerAttr(index)); \
+    dynamicDims.push_back(dimValue);                                           \
+  } while (0)
+
+  if (indicesRank == 0) {
+    // Scalar indices (output is rank(params) - 1).
+    if (resultShapeRank != paramsRank - 1) {
+      return nullptr;
+    }
+
+    // params.shape[:axis] + params.shape[axis+1:]
+    for (int64_t i = 0; i < paramsRank; ++i) {
+      if ((i == axisValue) || (i < axisValue && !resultShape.isDimDynamic(i)) ||
+          (i > axisValue && !resultShape.isDimDynamic(i - 1)))
+        continue;
+      GENERATE_RANKED_DIM_OP(paramsShapeValue, i);
+    }
+  } else if (indicesRank == 1) {
+    // Vector indices (output is rank(params)).
+    // Copy indices.shape into params.shape[axis]
+    if (resultShapeRank != paramsRank) {
+      return nullptr;
+    }
+
+    // params.shape[:axis] + indices.shape[batch_dims:]
+    //   + params.shape[indicesRank-batchDim+axisValue:]
+    int resultShapeIndex = 0;
+    // params.shape[:axis]
+    for (int64_t i = 0; i < axisValue; ++i) {
+      if (!resultShape.isDimDynamic(resultShapeIndex++)) continue;
+      GENERATE_RANKED_DIM_OP(paramsShapeValue, i);
+    }
+    // indices.shape[:batchDim]
+    for (int64_t i = batchDimValue;
+         i < indicesRank && resultShapeIndex < resultShapeRank; ++i) {
+      if (!resultShape.isDimDynamic(resultShapeIndex++)) continue;
+      GENERATE_RANKED_DIM_OP(indicesShapeValue, i);
+    }
+    // params.shape[indicesRank-batchDim+axisValue:]
+    // resultShapeIndex == indicesRank-batchDim+axisValue
+    for (int64_t i = resultShapeIndex; i < resultShapeRank; ++i) {
+      if (!resultShape.isDimDynamic(resultShapeIndex++)) continue;
+      GENERATE_RANKED_DIM_OP(paramsShapeValue, i);
+    }
+  } else {
+    // params.shape[:axis] + indices.shape[batch_dims:] + params.shape[axis +
+    // 1:]
+    // The expected rank is (paramsRank-1) + (indicesRank-batchDim)
+    auto expectedRank = paramsRank - 1 + indicesRank - batchDimValue;
+    if (resultShapeRank != expectedRank) {
+      return nullptr;
+    }
+
+    int resultShapeIndex = 0;
+    for (int64_t i = 0; i < axisValue; ++i) {
+      if (!resultShape.isDimDynamic(resultShapeIndex++)) continue;
+      GENERATE_RANKED_DIM_OP(paramsShapeValue, i);
+    }
+
+    for (int64_t i = batchDimValue; i < indicesRank; ++i) {
+      if (!resultShape.isDimDynamic(resultShapeIndex++)) continue;
+      GENERATE_RANKED_DIM_OP(indicesShapeValue, i);
+    }
+
+    for (int64_t i = axisValue + 1;
+         i < paramsRank && resultShapeIndex < resultShapeRank; ++i) {
+      if (!resultShape.isDimDynamic(resultShapeIndex++)) continue;
+      GENERATE_RANKED_DIM_OP(paramsShapeValue, i);
+    }
+  }
+#undef GENERATE_RANKED_DIM_OP
+
+  return builder.create<MakeRankedShapeOp>(loc, resultShape, dynamicDims);
+}
+
 }  // namespace
 
 // Creates a custom op shape builder for XLA-HLO ops that are not otherwise
@@ -340,6 +464,8 @@
   b.insertOpRankedShapeBuilder<TransposeOp>(rewriteTranspose);
   b.insertOpRankedShapeBuilder<mhlo::DotGeneralOp>(rewriteDotGeneral);
   b.insertOpRankedShapeBuilder<mhlo::DynamicReshapeOp>(rewriteDynamicReshape);
+  b.insertOpRankedShapeBuilder<mhlo::TorchIndexSelectOp>(
+      rewriteTorchIndexSelect);
 }
 
 }  // namespace mhlo
diff --git a/iree/compiler/Dialect/VM/Conversion/TypeConverter.cpp b/iree/compiler/Dialect/VM/Conversion/TypeConverter.cpp
index 668442f..de6e351 100644
--- a/iree/compiler/Dialect/VM/Conversion/TypeConverter.cpp
+++ b/iree/compiler/Dialect/VM/Conversion/TypeConverter.cpp
@@ -80,13 +80,12 @@
       });
 
   // TODO(b/145876978): materialize conversion for other types
-  addMaterialization([](PatternRewriter &rewriter,
-                        Shape::RankedShapeType resultType, ValueRange inputs,
-                        Location loc) -> Optional<Value> {
+  addArgumentMaterialization([](OpBuilder &builder,
+                                Shape::RankedShapeType resultType,
+                                ValueRange inputs, Location loc) -> Value {
     LLVM_DEBUG(llvm::dbgs()
                << "MATERIALIZE CONVERSION: " << resultType << "\n");
-    return rewriter.create<Shape::MakeRankedShapeOp>(loc, resultType, inputs)
-        .getResult();
+    return builder.create<Shape::MakeRankedShapeOp>(loc, resultType, inputs);
   });
 }
 
diff --git a/kokoro/gcp_ubuntu/cmake/continuous.cfg b/iree/compiler/Dialect/VM/Conversion/VMToEmitC/CMakeLists.txt
similarity index 64%
copy from kokoro/gcp_ubuntu/cmake/continuous.cfg
copy to iree/compiler/Dialect/VM/Conversion/VMToEmitC/CMakeLists.txt
index e4cc270..f4f29c9 100644
--- a/kokoro/gcp_ubuntu/cmake/continuous.cfg
+++ b/iree/compiler/Dialect/VM/Conversion/VMToEmitC/CMakeLists.txt
@@ -1,5 +1,3 @@
-# Format: //devtools/kokoro/config/proto/build.proto
-
 # Copyright 2020 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,6 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Deliberately blank as everything necessary is configured in common files, but
-# file must still exist to match corresponding (Google internal) job
-# configurations that trigger the builds.
+if(${IREE_ENABLE_EMITC})
+  iree_add_all_subdirs()
+  
+  iree_cc_library(
+    NAME
+      VMToEmitC
+    HDRS
+      "ConvertVMToEmitC.h"
+    SRCS
+      "ConvertVMToEmitC.cpp"
+    DEPS
+      MLIRIR
+      MLIRPass
+      MLIREmitC
+      MLIRTransforms
+      iree::compiler::Dialect::VM::IR
+    PUBLIC
+  )
+endif()
diff --git a/iree/compiler/Dialect/VM/Conversion/VMToEmitC/ConvertVMToEmitC.cpp b/iree/compiler/Dialect/VM/Conversion/VMToEmitC/ConvertVMToEmitC.cpp
new file mode 100644
index 0000000..1d6d96e
--- /dev/null
+++ b/iree/compiler/Dialect/VM/Conversion/VMToEmitC/ConvertVMToEmitC.cpp
@@ -0,0 +1,108 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/compiler/Dialect/VM/Conversion/VMToEmitC/ConvertVMToEmitC.h"
+
+#include "emitc/Dialect/EmitC/EmitCDialect.h"
+#include "iree/compiler/Dialect/VM/IR/VMOps.h"
+#include "mlir/IR/Matchers.h"
+#include "mlir/IR/Module.h"
+#include "mlir/Transforms/DialectConversion.h"
+
+namespace mlir {
+namespace iree_compiler {
+
+namespace {
+
+// Taken over from StandardToVM.
+// We need to replace the Op depending on the operand.
+// We could start with a conversion for IREE::VM::AddI32Op
+template <typename SrcOpTy, typename DstOpTy>
+class BinaryArithmeticOpConversion : public OpConversionPattern<SrcOpTy> {
+  using OpConversionPattern<SrcOpTy>::OpConversionPattern;
+
+ public:
+  BinaryArithmeticOpConversion(MLIRContext *context, StringRef funcName)
+      : OpConversionPattern<SrcOpTy>(context), funcName(funcName) {}
+
+ private:
+  LogicalResult matchAndRewrite(
+      SrcOpTy srcOp, ArrayRef<Value> operands,
+      ConversionPatternRewriter &rewriter) const override {
+    typename SrcOpTy::Adaptor srcAdapter(operands);
+
+    StringAttr callee = rewriter.getStringAttr(funcName);
+    ArrayAttr args =
+        rewriter.getArrayAttr({IntegerAttr::get(rewriter.getIndexType(), 0),
+                               IntegerAttr::get(rewriter.getIndexType(), 1)});
+    ValueRange dstOperands{srcAdapter.lhs(), srcAdapter.rhs()};
+
+    rewriter.replaceOpWithNewOp<DstOpTy>(srcOp, srcAdapter.lhs().getType(),
+                                         callee, args, dstOperands);
+
+    return success();
+  }
+
+  StringRef funcName;
+};
+
+}  // namespace
+
+void populateVMToCPatterns(MLIRContext *context,
+                           OwningRewritePatternList &patterns) {
+  patterns.insert<
+      BinaryArithmeticOpConversion<IREE::VM::AddI32Op, mlir::emitc::CallOp>>(
+      context, "vm_add_i32");
+}
+
+namespace IREE {
+namespace VM {
+
+namespace {
+
+// A pass converting IREE VM operations into the EmitC dialect.
+class ConvertVMToEmitCPass
+    : public PassWrapper<ConvertVMToEmitCPass,
+                         OperationPass<IREE::VM::ModuleOp>> {
+  void runOnOperation() {
+    ConversionTarget target(getContext());
+
+    OwningRewritePatternList patterns;
+    populateVMToCPatterns(&getContext(), patterns);
+
+    target.addLegalDialect<mlir::emitc::EmitCDialect>();
+    target.addLegalDialect<IREE::VM::VMDialect>();
+    target.addIllegalOp<IREE::VM::AddI32Op>();
+
+    if (failed(applyFullConversion(getOperation(), target, patterns))) {
+      return signalPassFailure();
+    }
+  }
+};
+
+}  // namespace
+
+std::unique_ptr<OperationPass<IREE::VM::ModuleOp>>
+createConvertVMToEmitCPass() {
+  return std::make_unique<ConvertVMToEmitCPass>();
+}
+
+}  // namespace VM
+}  // namespace IREE
+
+static PassRegistration<IREE::VM::ConvertVMToEmitCPass> pass(
+    "iree-convert-vm-to-emitc", "Convert VM Ops to the EmitC dialect");
+
+}  // namespace iree_compiler
+}  // namespace mlir
diff --git a/iree/compiler/Dialect/VM/Conversion/VMToEmitC/ConvertVMToEmitC.h b/iree/compiler/Dialect/VM/Conversion/VMToEmitC/ConvertVMToEmitC.h
new file mode 100644
index 0000000..a1bc545
--- /dev/null
+++ b/iree/compiler/Dialect/VM/Conversion/VMToEmitC/ConvertVMToEmitC.h
@@ -0,0 +1,38 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_COMPILER_DIALECT_VM_CONVERSION_VMTOEMITC_CONVERTVMTOEMITC_H_
+#define IREE_COMPILER_DIALECT_VM_CONVERSION_VMTOEMITC_CONVERTVMTOEMITC_H_
+
+#include "iree/compiler/Dialect/VM/IR/VMOps.h"
+#include "mlir/Pass/Pass.h"
+
+namespace mlir {
+namespace iree_compiler {
+
+void populateVMToCPatterns(MLIRContext *context,
+                           OwningRewritePatternList &patterns);
+
+namespace IREE {
+namespace VM {
+
+std::unique_ptr<OperationPass<IREE::VM::ModuleOp>> createConvertVMToEmitCPass();
+
+}  // namespace VM
+}  // namespace IREE
+
+}  // namespace iree_compiler
+}  // namespace mlir
+
+#endif  // IREE_COMPILER_DIALECT_VM_CONVERSION_VMTOEMITC_CONVERTVMTOEMITC_H_
diff --git a/kokoro/gcp_ubuntu/cmake/continuous.cfg b/iree/compiler/Dialect/VM/Conversion/VMToEmitC/test/CMakeLists.txt
similarity index 68%
copy from kokoro/gcp_ubuntu/cmake/continuous.cfg
copy to iree/compiler/Dialect/VM/Conversion/VMToEmitC/test/CMakeLists.txt
index e4cc270..fcc538b 100644
--- a/kokoro/gcp_ubuntu/cmake/continuous.cfg
+++ b/iree/compiler/Dialect/VM/Conversion/VMToEmitC/test/CMakeLists.txt
@@ -1,5 +1,3 @@
-# Format: //devtools/kokoro/config/proto/build.proto
-
 # Copyright 2020 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,6 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Deliberately blank as everything necessary is configured in common files, but
-# file must still exist to match corresponding (Google internal) job
-# configurations that trigger the builds.
+iree_add_all_subdirs()
+
+file(GLOB _GLOB_X_MLIR LIST_DIRECTORIES false RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} CONFIGURE_DEPENDS *.mlir)
+iree_lit_test_suite(
+  NAME
+    lit
+  SRCS
+    "${_GLOB_X_MLIR}"
+  DATA
+    iree::tools::IreeFileCheck
+    iree::tools::iree-opt
+)
diff --git a/iree/compiler/Dialect/VM/Conversion/VMToEmitC/test/add.mlir b/iree/compiler/Dialect/VM/Conversion/VMToEmitC/test/add.mlir
new file mode 100644
index 0000000..a693fe5
--- /dev/null
+++ b/iree/compiler/Dialect/VM/Conversion/VMToEmitC/test/add.mlir
@@ -0,0 +1,12 @@
+// RUN: iree-opt -split-input-file -pass-pipeline='iree-convert-vm-to-emitc' %s | IreeFileCheck %s
+
+// CHECK: vm.module @add_module {
+vm.module @add_module {
+  // CHECK-NEXT: vm.func @add_i32(%arg0: i32, %arg1: i32) {
+  vm.func @add_i32(%arg0: i32, %arg1: i32) {
+    // CHECK-NEXT: %0 = emitc.call "vm_add_i32"(%arg0, %arg1) {args = [0 : index, 1 : index]} : (i32, i32) -> i32
+    %0 = vm.add.i32 %arg0, %arg1 : i32
+    // CHECK-NEXT: vm.return
+    vm.return
+  }
+}
diff --git a/iree/compiler/Dialect/VM/IR/VMBase.td b/iree/compiler/Dialect/VM/IR/VMBase.td
index 12fb1a5..9243cef 100644
--- a/iree/compiler/Dialect/VM/IR/VMBase.td
+++ b/iree/compiler/Dialect/VM/IR/VMBase.td
@@ -56,8 +56,8 @@
 // VM opcodes
 //===----------------------------------------------------------------------===//
 // Opcode ranges:
-// 0x00-0x7F: core VM opcodes, reserved for this dialect
-// 0x80-0xFF: unreserved, used by target-specific ops (like SIMD)
+// 0x00-0x9F: core VM opcodes, reserved for this dialect
+// 0xA0-0xFF: unreserved, used to prefix extension op sets
 //
 // Note that changing existing opcode assignments will invalidate all binaries
 // and should only be done when breaking changes are acceptable. We could add a
@@ -67,9 +67,28 @@
 // Some opcodes require an extension prefix to indicate that runtime support
 // is optional. An op with the ExtI64 trait will require VM_OPC_ExtI64, for
 // example. Ops that bridge extension sets have a canonical form that may
-// require multiple prefix codes.
+// require multiple prefix codes (for example, the i64<->f64 extensions).
 
-class VM_OPC<int opcode, string name> : I32EnumAttrCase<name, opcode>;
+class VM_OPC<int opcode, string name> :
+    IntEnumAttrCaseBase<I8, name, name, opcode>;
+
+class VM_OPC_EnumAttr<string name, string enumName, string enumTag,
+                      string description,
+                      VM_OPC prefix = ?,
+                      list<VM_OPC> cases> :
+    IntEnumAttr<I8, name, description, cases> {
+  let cppNamespace = "IREE::VM";
+  let returnType = cppNamespace # "::" # name;
+  let underlyingType = "uint8_t";
+  let convertFromStorage = "static_cast<" # returnType # ">($_self.getInt())";
+  let constBuilderCall =
+          "$_builder.getI8IntegerAttr(static_cast<int8_t>($0))";
+
+  // Used by VMOpTableGen:
+  string opcodeEnumName = enumName;
+  VM_OPC opcodePrefix = prefix;
+  string opcodeEnumTag = enumTag;
+}
 
 // Globals:
 def VM_OPC_GlobalLoadI32         : VM_OPC<0x00, "GlobalLoadI32">;
@@ -131,7 +150,9 @@
 def VM_OPC_TruncI32I8            : VM_OPC<0x31, "TruncI32I8">;
 def VM_OPC_TruncI32I16           : VM_OPC<0x32, "TruncI32I16">;
 def VM_OPC_ExtI8I32S             : VM_OPC<0x33, "ExtI8I32S">;
-def VM_OPC_ExtI16I32S            : VM_OPC<0x34, "ExtI16I32S">;
+def VM_OPC_ExtI8I32U             : VM_OPC<0x34, "ExtI8I32U">;
+def VM_OPC_ExtI16I32S            : VM_OPC<0x35, "ExtI16I32S">;
+def VM_OPC_ExtI16I32U            : VM_OPC<0x36, "ExtI16I32U">;
 
 // Reduction arithmetic:
 
@@ -140,12 +161,6 @@
 def VM_OPC_CmpNEI32              : VM_OPC<0x41, "CmpNEI32">;
 def VM_OPC_CmpLTI32S             : VM_OPC<0x42, "CmpLTI32S">;
 def VM_OPC_CmpLTI32U             : VM_OPC<0x43, "CmpLTI32U">;
-def VM_OPC_CmpLTEI32S            : VM_OPC<0x44, "CmpLTEI32S">;
-def VM_OPC_CmpLTEI32U            : VM_OPC<0x45, "CmpLTEI32U">;
-def VM_OPC_CmpGTI32S             : VM_OPC<0x46, "CmpGTI32S">;
-def VM_OPC_CmpGTI32U             : VM_OPC<0x47, "CmpGTI32U">;
-def VM_OPC_CmpGTEI32S            : VM_OPC<0x48, "CmpGTEI32S">;
-def VM_OPC_CmpGTEI32U            : VM_OPC<0x49, "CmpGTEI32U">;
 def VM_OPC_CmpNZI32              : VM_OPC<0x4D, "CmpNZI32">;
 def VM_OPC_CmpEQRef              : VM_OPC<0x4A, "CmpEQRef">;
 def VM_OPC_CmpNERef              : VM_OPC<0x4B, "CmpNERef">;
@@ -169,9 +184,175 @@
 def VM_OPC_Break                 : VM_OPC<0x7F, "Break">;
 
 // Extension prefixes:
-def VM_OPC_ExtI64                : VM_OPC<0x80, "ExtI64">;
-def VM_OPC_ExtF32                : VM_OPC<0x81, "ExtF32">;
-def VM_OPC_ExtF64                : VM_OPC<0x82, "ExtF64">;
+def VM_OPC_PrefixExtI64          : VM_OPC<0xA0, "PrefixExtI64">;
+def VM_OPC_PrefixExtF32          : VM_OPC<0xA1, "PrefixExtF32">;
+def VM_OPC_PrefixExtF64          : VM_OPC<0xA2, "PrefixExtF64">;
+
+// Runtime enum iree_vm_core_op_t:
+def VM_CoreOpcodeAttr :
+    VM_OPC_EnumAttr<"Opcode",
+                    "iree_vm_core_op_t",
+                    "CORE",  // IREE_VM_OP_CORE_*
+                    "valid VM core operation encodings",
+                    ?, [
+    // Core VM opcodes (0x00-0x9F):
+    VM_OPC_GlobalLoadI32,
+    VM_OPC_GlobalStoreI32,
+    VM_OPC_GlobalLoadIndirectI32,
+    VM_OPC_GlobalStoreIndirectI32,
+    VM_OPC_GlobalLoadRef,
+    VM_OPC_GlobalStoreRef,
+    VM_OPC_GlobalLoadIndirectRef,
+    VM_OPC_GlobalStoreIndirectRef,
+    VM_OPC_ConstI32Zero,
+    VM_OPC_ConstI32,
+    VM_OPC_ConstRefZero,
+    VM_OPC_ConstRefRodata,
+    VM_OPC_ListAlloc,
+    VM_OPC_ListReserve,
+    VM_OPC_ListSize,
+    VM_OPC_ListResize,
+    VM_OPC_ListGetI32,
+    VM_OPC_ListSetI32,
+    VM_OPC_ListGetRef,
+    VM_OPC_ListSetRef,
+    VM_OPC_SelectI32,
+    VM_OPC_SelectRef,
+    VM_OPC_SwitchI32,
+    VM_OPC_SwitchRef,
+    VM_OPC_AddI32,
+    VM_OPC_SubI32,
+    VM_OPC_MulI32,
+    VM_OPC_DivI32S,
+    VM_OPC_DivI32U,
+    VM_OPC_RemI32S,
+    VM_OPC_RemI32U,
+    VM_OPC_NotI32,
+    VM_OPC_AndI32,
+    VM_OPC_OrI32,
+    VM_OPC_XorI32,
+    VM_OPC_ShlI32,
+    VM_OPC_ShrI32S,
+    VM_OPC_ShrI32U,
+    VM_OPC_TruncI32I8,
+    VM_OPC_TruncI32I16,
+    VM_OPC_ExtI8I32S,
+    VM_OPC_ExtI8I32U,
+    VM_OPC_ExtI16I32S,
+    VM_OPC_ExtI16I32U,
+    VM_OPC_CmpEQI32,
+    VM_OPC_CmpNEI32,
+    VM_OPC_CmpLTI32S,
+    VM_OPC_CmpLTI32U,
+    VM_OPC_CmpNZI32,
+    VM_OPC_CmpEQRef,
+    VM_OPC_CmpNERef,
+    VM_OPC_CmpNZRef,
+    VM_OPC_Branch,
+    VM_OPC_CondBranch,
+    VM_OPC_Call,
+    VM_OPC_CallVariadic,
+    VM_OPC_Return,
+    VM_OPC_Fail,
+    VM_OPC_Yield,
+    VM_OPC_Trace,
+    VM_OPC_Print,
+    VM_OPC_CondBreak,
+    VM_OPC_Break,
+
+    // Extension opcodes (0xA0-0xFF):
+    VM_OPC_PrefixExtI64,  // VM_ExtI64OpcodeAttr
+    VM_OPC_PrefixExtF32,  // VM_ExtF32OpcodeAttr
+    VM_OPC_PrefixExtF64,  // VM_ExtF64OpcodeAttr
+  ]>;
+
+// i64 extension:
+// (ops are encoded as a VM_OPC_ExtI64 + the opcode below)
+def VM_OPC_GlobalLoadI64         : VM_OPC<0x00, "GlobalLoadI64">;
+def VM_OPC_GlobalStoreI64        : VM_OPC<0x01, "GlobalStoreI64">;
+def VM_OPC_GlobalLoadIndirectI64 : VM_OPC<0x02, "GlobalLoadIndirectI64">;
+def VM_OPC_GlobalStoreIndirectI64: VM_OPC<0x03, "GlobalStoreIndirectI64">;
+def VM_OPC_ConstI64Zero          : VM_OPC<0x08, "ConstI64Zero">;
+def VM_OPC_ConstI64              : VM_OPC<0x09, "ConstI64">;
+def VM_OPC_ListGetI64            : VM_OPC<0x14, "ListGetI64">;
+def VM_OPC_ListSetI64            : VM_OPC<0x15, "ListSetI64">;
+def VM_OPC_SelectI64             : VM_OPC<0x1E, "SelectI64">;
+def VM_OPC_SwitchI64             : VM_OPC<0x20, "SwitchI64">;
+def VM_OPC_AddI64                : VM_OPC<0x22, "AddI64">;
+def VM_OPC_SubI64                : VM_OPC<0x23, "SubI64">;
+def VM_OPC_MulI64                : VM_OPC<0x24, "MulI64">;
+def VM_OPC_DivI64S               : VM_OPC<0x25, "DivI64S">;
+def VM_OPC_DivI64U               : VM_OPC<0x26, "DivI64U">;
+def VM_OPC_RemI64S               : VM_OPC<0x27, "RemI64S">;
+def VM_OPC_RemI64U               : VM_OPC<0x28, "RemI64U">;
+def VM_OPC_NotI64                : VM_OPC<0x29, "NotI64">;
+def VM_OPC_AndI64                : VM_OPC<0x2A, "AndI64">;
+def VM_OPC_OrI64                 : VM_OPC<0x2B, "OrI64">;
+def VM_OPC_XorI64                : VM_OPC<0x2C, "XorI64">;
+def VM_OPC_ShlI64                : VM_OPC<0x2D, "ShlI64">;
+def VM_OPC_ShrI64S               : VM_OPC<0x2E, "ShrI64S">;
+def VM_OPC_ShrI64U               : VM_OPC<0x2F, "ShrI64U">;
+def VM_OPC_TruncI64I8            : VM_OPC<0x30, "TruncI64I8">;
+def VM_OPC_TruncI64I16           : VM_OPC<0x31, "TruncI64I16">;
+def VM_OPC_TruncI64I32           : VM_OPC<0x32, "TruncI64I32">;
+def VM_OPC_ExtI8I64S             : VM_OPC<0x33, "ExtI8I64S">;
+def VM_OPC_ExtI8I64U             : VM_OPC<0x34, "ExtI8I64U">;
+def VM_OPC_ExtI16I64S            : VM_OPC<0x35, "ExtI16I64S">;
+def VM_OPC_ExtI16I64U            : VM_OPC<0x36, "ExtI16I64U">;
+def VM_OPC_ExtI32I64S            : VM_OPC<0x37, "ExtI32I64S">;
+def VM_OPC_ExtI32I64U            : VM_OPC<0x38, "ExtI32I64U">;
+def VM_OPC_CmpEQI64              : VM_OPC<0x40, "CmpEQI64">;
+def VM_OPC_CmpNEI64              : VM_OPC<0x41, "CmpNEI64">;
+def VM_OPC_CmpLTI64S             : VM_OPC<0x42, "CmpLTI64S">;
+def VM_OPC_CmpLTI64U             : VM_OPC<0x43, "CmpLTI64U">;
+def VM_OPC_CmpNZI64              : VM_OPC<0x4D, "CmpNZI64">;
+
+// Runtime enum iree_vm_ext_i64_op_t:
+def VM_ExtI64OpcodeAttr :
+    VM_OPC_EnumAttr<"ExtI64Opcode",
+                    "iree_vm_ext_i64_op_t",
+                    "EXT_I64",  // IREE_VM_OP_EXT_I64_*
+                    "valid VM operation encodings in the i64 extension",
+                    VM_OPC_PrefixExtI64, [
+    VM_OPC_GlobalLoadI64,
+    VM_OPC_GlobalStoreI64,
+    VM_OPC_GlobalLoadIndirectI64,
+    VM_OPC_GlobalStoreIndirectI64,
+    VM_OPC_ConstI64Zero,
+    VM_OPC_ConstI64,
+    VM_OPC_ListGetI64,
+    VM_OPC_ListSetI64,
+    VM_OPC_SelectI64,
+    VM_OPC_SwitchI64,
+    VM_OPC_AddI64,
+    VM_OPC_SubI64,
+    VM_OPC_MulI64,
+    VM_OPC_DivI64S,
+    VM_OPC_DivI64U,
+    VM_OPC_RemI64S,
+    VM_OPC_RemI64U,
+    VM_OPC_NotI64,
+    VM_OPC_AndI64,
+    VM_OPC_OrI64,
+    VM_OPC_XorI64,
+    VM_OPC_ShlI64,
+    VM_OPC_ShrI64S,
+    VM_OPC_ShrI64U,
+    VM_OPC_TruncI64I8,
+    VM_OPC_TruncI64I16,
+    VM_OPC_TruncI64I32,
+    VM_OPC_ExtI8I64S,
+    VM_OPC_ExtI8I64U,
+    VM_OPC_ExtI16I64S,
+    VM_OPC_ExtI16I64U,
+    VM_OPC_ExtI32I64S,
+    VM_OPC_ExtI32I64U,
+    VM_OPC_CmpEQI64,
+    VM_OPC_CmpNEI64,
+    VM_OPC_CmpLTI64S,
+    VM_OPC_CmpLTI64U,
+    VM_OPC_CmpNZI64,
+  ]>;
 
 //===----------------------------------------------------------------------===//
 // Declarative encoding framework
diff --git a/iree/compiler/Dialect/VM/IR/VMOpFolders.cpp b/iree/compiler/Dialect/VM/IR/VMOpFolders.cpp
index 521ef06..b3ceb17 100644
--- a/iree/compiler/Dialect/VM/IR/VMOpFolders.cpp
+++ b/iree/compiler/Dialect/VM/IR/VMOpFolders.cpp
@@ -605,11 +605,21 @@
       operands, [&](APInt a) { return a.trunc(8).sext(32); });
 }
 
+OpFoldResult ExtI8I32UOp::fold(ArrayRef<Attribute> operands) {
+  return constFoldUnaryOp<IntegerAttr>(
+      operands, [&](APInt a) { return a.trunc(8).zext(32); });
+}
+
 OpFoldResult ExtI16I32SOp::fold(ArrayRef<Attribute> operands) {
   return constFoldUnaryOp<IntegerAttr>(
       operands, [&](APInt a) { return a.trunc(16).sext(32); });
 }
 
+OpFoldResult ExtI16I32UOp::fold(ArrayRef<Attribute> operands) {
+  return constFoldUnaryOp<IntegerAttr>(
+      operands, [&](APInt a) { return a.trunc(16).zext(32); });
+}
+
 //===----------------------------------------------------------------------===//
 // Native reduction (horizontal) arithmetic
 //===----------------------------------------------------------------------===//
@@ -706,9 +716,7 @@
 }
 
 void CmpLTI32SOp::getCanonicalizationPatterns(OwningRewritePatternList &results,
-                                              MLIRContext *context) {
-  results.insert<SwapInvertedCmpOps<CmpLTI32SOp, CmpGTEI32SOp>>(context);
-}
+                                              MLIRContext *context) {}
 
 OpFoldResult CmpLTI32UOp::fold(ArrayRef<Attribute> operands) {
   if (lhs() == rhs()) {
@@ -720,9 +728,27 @@
 }
 
 void CmpLTI32UOp::getCanonicalizationPatterns(OwningRewritePatternList &results,
-                                              MLIRContext *context) {
-  results.insert<SwapInvertedCmpOps<CmpLTI32UOp, CmpGTEI32UOp>>(context);
-}
+                                              MLIRContext *context) {}
+
+namespace {
+
+/// Rewrites a vm.cmp.lte.* pseudo op to a vm.cmp.lt.* op.
+template <typename T, typename U>
+struct RewritePseudoCmpLTEToLT : public OpRewritePattern<T> {
+  using OpRewritePattern<T>::OpRewritePattern;
+  LogicalResult matchAndRewrite(T op,
+                                PatternRewriter &rewriter) const override {
+    // !(lhs > rhs)
+    auto condValue =
+        rewriter.createOrFold<U>(op.getLoc(), op.getType(), op.rhs(), op.lhs());
+    rewriter.replaceOpWithNewOp<XorI32Op>(
+        op, op.getType(), condValue,
+        rewriter.createOrFold<IREE::VM::ConstI32Op>(op.getLoc(), 1));
+    return success();
+  }
+};
+
+}  // namespace
 
 OpFoldResult CmpLTEI32SOp::fold(ArrayRef<Attribute> operands) {
   if (lhs() == rhs()) {
@@ -736,6 +762,7 @@
 void CmpLTEI32SOp::getCanonicalizationPatterns(
     OwningRewritePatternList &results, MLIRContext *context) {
   results.insert<SwapInvertedCmpOps<CmpLTEI32SOp, CmpGTI32SOp>>(context);
+  results.insert<RewritePseudoCmpLTEToLT<CmpLTEI32SOp, CmpLTI32SOp>>(context);
 }
 
 OpFoldResult CmpLTEI32UOp::fold(ArrayRef<Attribute> operands) {
@@ -750,8 +777,25 @@
 void CmpLTEI32UOp::getCanonicalizationPatterns(
     OwningRewritePatternList &results, MLIRContext *context) {
   results.insert<SwapInvertedCmpOps<CmpLTEI32UOp, CmpGTI32UOp>>(context);
+  results.insert<RewritePseudoCmpLTEToLT<CmpLTEI32UOp, CmpLTI32UOp>>(context);
 }
 
+namespace {
+
+/// Rewrites a vm.cmp.gt.* pseudo op to a vm.cmp.lt.* op.
+template <typename T, typename U>
+struct RewritePseudoCmpGTToLT : public OpRewritePattern<T> {
+  using OpRewritePattern<T>::OpRewritePattern;
+  LogicalResult matchAndRewrite(T op,
+                                PatternRewriter &rewriter) const override {
+    // rhs < lhs
+    rewriter.replaceOpWithNewOp<U>(op, op.getType(), op.rhs(), op.lhs());
+    return success();
+  }
+};
+
+}  // namespace
+
 OpFoldResult CmpGTI32SOp::fold(ArrayRef<Attribute> operands) {
   if (lhs() == rhs()) {
     // x > x = false
@@ -764,6 +808,7 @@
 void CmpGTI32SOp::getCanonicalizationPatterns(OwningRewritePatternList &results,
                                               MLIRContext *context) {
   results.insert<SwapInvertedCmpOps<CmpGTI32SOp, CmpLTEI32SOp>>(context);
+  results.insert<RewritePseudoCmpGTToLT<CmpGTI32SOp, CmpLTI32SOp>>(context);
 }
 
 OpFoldResult CmpGTI32UOp::fold(ArrayRef<Attribute> operands) {
@@ -778,8 +823,29 @@
 void CmpGTI32UOp::getCanonicalizationPatterns(OwningRewritePatternList &results,
                                               MLIRContext *context) {
   results.insert<SwapInvertedCmpOps<CmpGTI32UOp, CmpLTEI32UOp>>(context);
+  results.insert<RewritePseudoCmpGTToLT<CmpGTI32UOp, CmpLTI32UOp>>(context);
 }
 
+namespace {
+
+/// Rewrites a vm.cmp.gte.* pseudo op to a vm.cmp.lt.* op.
+template <typename T, typename U>
+struct RewritePseudoCmpGTEToLT : public OpRewritePattern<T> {
+  using OpRewritePattern<T>::OpRewritePattern;
+  LogicalResult matchAndRewrite(T op,
+                                PatternRewriter &rewriter) const override {
+    // !(lhs < rhs)
+    auto condValue =
+        rewriter.createOrFold<U>(op.getLoc(), op.getType(), op.lhs(), op.rhs());
+    rewriter.replaceOpWithNewOp<XorI32Op>(
+        op, op.getType(), condValue,
+        rewriter.createOrFold<IREE::VM::ConstI32Op>(op.getLoc(), 1));
+    return success();
+  }
+};
+
+}  // namespace
+
 OpFoldResult CmpGTEI32SOp::fold(ArrayRef<Attribute> operands) {
   if (lhs() == rhs()) {
     // x >= x = true
@@ -792,6 +858,7 @@
 void CmpGTEI32SOp::getCanonicalizationPatterns(
     OwningRewritePatternList &results, MLIRContext *context) {
   results.insert<SwapInvertedCmpOps<CmpGTEI32SOp, CmpLTI32SOp>>(context);
+  results.insert<RewritePseudoCmpGTEToLT<CmpGTEI32SOp, CmpLTI32SOp>>(context);
 }
 
 OpFoldResult CmpGTEI32UOp::fold(ArrayRef<Attribute> operands) {
@@ -806,6 +873,7 @@
 void CmpGTEI32UOp::getCanonicalizationPatterns(
     OwningRewritePatternList &results, MLIRContext *context) {
   results.insert<SwapInvertedCmpOps<CmpGTEI32UOp, CmpLTI32UOp>>(context);
+  results.insert<RewritePseudoCmpGTEToLT<CmpGTEI32UOp, CmpLTI32UOp>>(context);
 }
 
 OpFoldResult CmpNZI32Op::fold(ArrayRef<Attribute> operands) {
diff --git a/iree/compiler/Dialect/VM/IR/VMOps.td b/iree/compiler/Dialect/VM/IR/VMOps.td
index 7c8f33c..b3a7a4e 100644
--- a/iree/compiler/Dialect/VM/IR/VMOps.td
+++ b/iree/compiler/Dialect/VM/IR/VMOps.td
@@ -1294,11 +1294,21 @@
   let hasFolder = 1;
 }
 
+def VM_ExtI8I32UOp : VM_UnaryArithmeticOp<I32, "ext.i8.i32.u", VM_OPC_ExtI8I32U> {
+  let summary = [{integer zero extend 8 bits to 32 bits}];
+  let hasFolder = 1;
+}
+
 def VM_ExtI16I32SOp : VM_UnaryArithmeticOp<I32, "ext.i16.i32.s", VM_OPC_ExtI16I32S> {
   let summary = [{integer sign extend 16 bits to 32 bits}];
   let hasFolder = 1;
 }
 
+def VM_ExtI16I32UOp : VM_UnaryArithmeticOp<I32, "ext.i16.i32.u", VM_OPC_ExtI16I32U> {
+  let summary = [{integer zero extend 16 bits to 32 bits}];
+  let hasFolder = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // Native reduction (horizontal) arithmetic
 //===----------------------------------------------------------------------===//
@@ -1362,6 +1372,27 @@
   ];
 }
 
+class VM_BinaryComparisonPseudoOp<Type type, string mnemonic,
+                                  list<OpTrait> traits = []> :
+    VM_PureOp<mnemonic, !listconcat(traits, [
+      AllTypesMatch<["lhs", "rhs"]>,
+      VM_PseudoOp,
+    ])> {
+  let description = [{
+    Compares two operands with the specified predicate.
+  }];
+
+  let arguments = (ins
+    type:$lhs,
+    type:$rhs
+  );
+  let results = (outs
+    I32:$result
+  );
+
+  let assemblyFormat = "operands attr-dict `:` type($lhs)";
+}
+
 def VM_CmpEQI32Op :
     VM_BinaryComparisonOp<I32, "cmp.eq.i32", VM_OPC_CmpEQI32, [Commutative]> {
   let summary = [{integer equality comparison operation}];
@@ -1391,43 +1422,42 @@
 }
 
 def VM_CmpLTEI32SOp :
-    VM_BinaryComparisonOp<I32, "cmp.lte.i32.s", VM_OPC_CmpLTEI32S> {
+    VM_BinaryComparisonPseudoOp<I32, "cmp.lte.i32.s"> {
   let summary = [{signed integer less-than-or-equal comparison operation}];
   let hasCanonicalizer = 1;
   let hasFolder = 1;
 }
 
 def VM_CmpLTEI32UOp :
-    VM_BinaryComparisonOp<I32, "cmp.lte.i32.u", VM_OPC_CmpLTEI32U> {
+    VM_BinaryComparisonPseudoOp<I32, "cmp.lte.i32.u"> {
   let summary = [{unsigned integer less-than-or-equal comparison operation}];
   let hasCanonicalizer = 1;
   let hasFolder = 1;
 }
 
-// TODO(benvanik): drop these and rely on lt/lte only?
 def VM_CmpGTI32SOp :
-    VM_BinaryComparisonOp<I32, "cmp.gt.i32.s", VM_OPC_CmpGTI32S> {
+    VM_BinaryComparisonPseudoOp<I32, "cmp.gt.i32.s"> {
   let summary = [{signed integer greater-than comparison operation}];
   let hasCanonicalizer = 1;
   let hasFolder = 1;
 }
 
 def VM_CmpGTI32UOp :
-    VM_BinaryComparisonOp<I32, "cmp.gt.i32.u", VM_OPC_CmpGTI32U> {
+    VM_BinaryComparisonPseudoOp<I32, "cmp.gt.i32.u"> {
   let summary = [{unsigned integer greater-than comparison operation}];
   let hasCanonicalizer = 1;
   let hasFolder = 1;
 }
 
 def VM_CmpGTEI32SOp :
-    VM_BinaryComparisonOp<I32, "cmp.gte.i32.s", VM_OPC_CmpGTEI32S> {
+    VM_BinaryComparisonPseudoOp<I32, "cmp.gte.i32.s"> {
   let summary = [{signed integer greater-than-or-equal comparison operation}];
   let hasCanonicalizer = 1;
   let hasFolder = 1;
 }
 
 def VM_CmpGTEI32UOp :
-    VM_BinaryComparisonOp<I32, "cmp.gte.i32.u", VM_OPC_CmpGTEI32U> {
+    VM_BinaryComparisonPseudoOp<I32, "cmp.gte.i32.u"> {
   let summary = [{unsigned integer greater-than-or-equal comparison operation}];
   let hasCanonicalizer = 1;
   let hasFolder = 1;
diff --git a/iree/compiler/Dialect/VM/IR/test/conversion_folding.mlir b/iree/compiler/Dialect/VM/IR/test/conversion_folding.mlir
index 319202d..02a9ea3 100644
--- a/iree/compiler/Dialect/VM/IR/test/conversion_folding.mlir
+++ b/iree/compiler/Dialect/VM/IR/test/conversion_folding.mlir
@@ -33,6 +33,14 @@
     vm.return %0 : i32
   }
 
+  // CHECK-LABEL: @ext_i8_i32_u_const
+  vm.func @ext_i8_i32_u_const() -> i32 {
+    // CHECK: vm.const.i32 255 : i32
+    %c = vm.const.i32 0x000000FF : i32
+    %0 = vm.ext.i8.i32.u %c : i32
+    vm.return %0 : i32
+  }
+
   // CHECK-LABEL: @ext_i16_i32_s_const
   vm.func @ext_i16_i32_s_const() -> i32 {
     // CHECK: vm.const.i32 -1 : i32
@@ -40,4 +48,12 @@
     %0 = vm.ext.i16.i32.s %c : i32
     vm.return %0 : i32
   }
+
+  // CHECK-LABEL: @ext_i16_i32_u_const
+  vm.func @ext_i16_i32_u_const() -> i32 {
+    // CHECK: vm.const.i32 65535 : i32
+    %c = vm.const.i32 0x0000FFFF : i32
+    %0 = vm.ext.i16.i32.u %c : i32
+    vm.return %0 : i32
+  }
 }
diff --git a/iree/compiler/Dialect/VM/IR/test/conversion_ops.mlir b/iree/compiler/Dialect/VM/IR/test/conversion_ops.mlir
index c242930..008c5d6 100644
--- a/iree/compiler/Dialect/VM/IR/test/conversion_ops.mlir
+++ b/iree/compiler/Dialect/VM/IR/test/conversion_ops.mlir
@@ -20,8 +20,12 @@
   vm.func @ext(%arg0 : i32) -> i32 {
     // CHECK-NEXT: %0 = vm.ext.i8.i32.s %arg0 : i32
     %0 = vm.ext.i8.i32.s %arg0 : i32
-    // CHECK-NEXT: %1 = vm.ext.i16.i32.s %0 : i32
-    %1 = vm.ext.i16.i32.s %0 : i32
-    vm.return %1 : i32
+    // CHECK-NEXT: %1 = vm.ext.i8.i32.u %0 : i32
+    %1 = vm.ext.i8.i32.u %0 : i32
+    // CHECK-NEXT: %2 = vm.ext.i16.i32.s %1 : i32
+    %2 = vm.ext.i16.i32.s %1 : i32
+    // CHECK-NEXT: %3 = vm.ext.i16.i32.u %2 : i32
+    %3 = vm.ext.i16.i32.u %2 : i32
+    vm.return %3 : i32
   }
 }
diff --git a/iree/compiler/Dialect/VM/Target/C/CMakeLists.txt b/iree/compiler/Dialect/VM/Target/C/CMakeLists.txt
new file mode 100644
index 0000000..5183093
--- /dev/null
+++ b/iree/compiler/Dialect/VM/Target/C/CMakeLists.txt
@@ -0,0 +1,36 @@
 +# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+if(${IREE_ENABLE_EMITC})
+  iree_add_all_subdirs()
 +
+  iree_cc_library(
+    NAME
+      C
+    HDRS
+      "CModuleTarget.h"
+    SRCS
+      "CModuleTarget.cpp"
+      "TranslationRegistration.cpp"
+    DEPS
+      LLVMSupport
+      MLIRIR
+      MLIRPass
+      MLIRSupport
+      iree::compiler::Dialect::VM::IR
+      iree::compiler::Dialect::VM::Conversion::VMToEmitC
+    PUBLIC
+  )
+endif()
diff --git a/iree/compiler/Dialect/VM/Target/C/CModuleTarget.cpp b/iree/compiler/Dialect/VM/Target/C/CModuleTarget.cpp
new file mode 100644
index 0000000..923be34
--- /dev/null
+++ b/iree/compiler/Dialect/VM/Target/C/CModuleTarget.cpp
@@ -0,0 +1,54 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/compiler/Dialect/VM/Target/C/CModuleTarget.h"
+
+#include "iree/compiler/Dialect/VM/Conversion/VMToEmitC/ConvertVMToEmitC.h"
+#include "mlir/Pass/PassManager.h"
+
+namespace mlir {
+namespace iree_compiler {
+namespace IREE {
+namespace VM {
+
+LogicalResult translateModuleToC(IREE::VM::ModuleOp moduleOp,
+                                 llvm::raw_ostream &output) {
+  // TODO: implement translation
+  output << "// c module stub\n";
+
+  return success();
+}
+
+LogicalResult translateModuleToC(mlir::ModuleOp outerModuleOp,
+                                 llvm::raw_ostream &output) {
+  PassManager pm(outerModuleOp.getContext());
+
+  pm.addPass(createConvertVMToEmitCPass());
+
+  if (failed(pm.run(outerModuleOp))) {
+    return failure();
+  }
+
+  auto moduleOps = outerModuleOp.getOps<IREE::VM::ModuleOp>();
+  if (moduleOps.empty()) {
+    return outerModuleOp.emitError()
+           << "outer module does not contain a vm.module op";
+  }
+  return translateModuleToC(*moduleOps.begin(), output);
+}
+
+}  // namespace VM
+}  // namespace IREE
+}  // namespace iree_compiler
+}  // namespace mlir
diff --git a/iree/compiler/Dialect/VM/Target/C/CModuleTarget.h b/iree/compiler/Dialect/VM/Target/C/CModuleTarget.h
new file mode 100644
index 0000000..1268b3a
--- /dev/null
+++ b/iree/compiler/Dialect/VM/Target/C/CModuleTarget.h
@@ -0,0 +1,41 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_COMPILER_DIALECT_VM_TARGET_C_CMODULETARGET_H_
+#define IREE_COMPILER_DIALECT_VM_TARGET_C_CMODULETARGET_H_
+
+#include "iree/compiler/Dialect/VM/IR/VMOps.h"
+#include "llvm/Support/raw_ostream.h"
+#include "mlir/IR/Module.h"
+#include "mlir/Support/LogicalResult.h"
+
+namespace mlir {
+namespace iree_compiler {
+namespace IREE {
+namespace VM {
+
+// Translates a vm.module to a c module.
+//
+// Exposed via the --iree-vm-ir-to-c-module translation.
+LogicalResult translateModuleToC(IREE::VM::ModuleOp moduleOp,
+                                 llvm::raw_ostream &output);
+LogicalResult translateModuleToC(mlir::ModuleOp outerModuleOp,
+                                 llvm::raw_ostream &output);
+
+}  // namespace VM
+}  // namespace IREE
+}  // namespace iree_compiler
+}  // namespace mlir
+
+#endif  // IREE_COMPILER_DIALECT_VM_TARGET_C_CMODULETARGET_H_
diff --git a/iree/compiler/Dialect/VM/Target/C/TranslationRegistration.cpp b/iree/compiler/Dialect/VM/Target/C/TranslationRegistration.cpp
new file mode 100644
index 0000000..bdeea02
--- /dev/null
+++ b/iree/compiler/Dialect/VM/Target/C/TranslationRegistration.cpp
@@ -0,0 +1,34 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/compiler/Dialect/VM/Target/C/CModuleTarget.h"
+#include "mlir/Translation.h"
+
+namespace mlir {
+namespace iree_compiler {
+namespace IREE {
+namespace VM {
+
+void registerToCTranslation() {
+  TranslateFromMLIRRegistration toCModule(
+      "iree-vm-ir-to-c-module",
+      [](mlir::ModuleOp moduleOp, llvm::raw_ostream &output) {
+        return translateModuleToC(moduleOp, output);
+      });
+}
+
+}  // namespace VM
+}  // namespace IREE
+}  // namespace iree_compiler
+}  // namespace mlir
diff --git a/kokoro/gcp_ubuntu/cmake/continuous.cfg b/iree/compiler/Dialect/VM/Target/C/test/CMakeLists.txt
similarity index 67%
copy from kokoro/gcp_ubuntu/cmake/continuous.cfg
copy to iree/compiler/Dialect/VM/Target/C/test/CMakeLists.txt
index e4cc270..6495524 100644
--- a/kokoro/gcp_ubuntu/cmake/continuous.cfg
+++ b/iree/compiler/Dialect/VM/Target/C/test/CMakeLists.txt
@@ -1,5 +1,3 @@
-# Format: //devtools/kokoro/config/proto/build.proto
-
 # Copyright 2020 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,6 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Deliberately blank as everything necessary is configured in common files, but
-# file must still exist to match corresponding (Google internal) job
-# configurations that trigger the builds.
+iree_add_all_subdirs()
+
+file(GLOB _GLOB_X_MLIR LIST_DIRECTORIES false RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} CONFIGURE_DEPENDS *.mlir)
+iree_lit_test_suite(
+  NAME
+    lit
+  SRCS
+    "${_GLOB_X_MLIR}"
+  DATA
+    iree::tools::IreeFileCheck
+    iree::tools::iree-translate
+)
diff --git a/iree/compiler/Dialect/VM/Target/C/test/empty_module.mlir b/iree/compiler/Dialect/VM/Target/C/test/empty_module.mlir
new file mode 100644
index 0000000..60ef77a
--- /dev/null
+++ b/iree/compiler/Dialect/VM/Target/C/test/empty_module.mlir
@@ -0,0 +1,5 @@
+// RUN: iree-translate -iree-vm-ir-to-c-module %s | IreeFileCheck %s
+
+// CHECK: // c module stub
+vm.module @empty_module {
+}
diff --git a/iree/compiler/Dialect/VM/Target/CMakeLists.txt b/iree/compiler/Dialect/VM/Target/CMakeLists.txt
index c2d6a1d..fb9d914 100644
--- a/iree/compiler/Dialect/VM/Target/CMakeLists.txt
+++ b/iree/compiler/Dialect/VM/Target/CMakeLists.txt
@@ -12,8 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# bazel_to_cmake: DO NOT EDIT (EmitC is only buildable with CMake)
+
 iree_add_all_subdirs()
 
+if(IREE_ENABLE_EMITC)
+  set(IREE_VM_CONDITIONAL_TARGETS
+    iree::compiler::Dialect::VM::Target::C
+  )
+endif()
+
 iree_cc_library(
   NAME
     init_targets
@@ -21,5 +29,6 @@
     "init_targets.h"
   DEPS
     iree::compiler::Dialect::VM::Target::Bytecode
+    ${IREE_VM_CONDITIONAL_TARGETS}
   PUBLIC
 )
diff --git a/iree/compiler/Dialect/VM/Target/init_targets.h b/iree/compiler/Dialect/VM/Target/init_targets.h
index 5866a77..6099564 100644
--- a/iree/compiler/Dialect/VM/Target/init_targets.h
+++ b/iree/compiler/Dialect/VM/Target/init_targets.h
@@ -21,6 +21,9 @@
 namespace IREE {
 namespace VM {
 void registerToVMBytecodeTranslation();
+#ifdef IREE_HAVE_EMITC_DIALECT
+void registerToCTranslation();
+#endif  // IREE_HAVE_EMITC_DIALECT
 }  // namespace VM
 }  // namespace IREE
 
@@ -31,6 +34,10 @@
 inline void registerVMTargets() {
   static bool init_once = []() {
     IREE::VM::registerToVMBytecodeTranslation();
+#ifdef IREE_HAVE_EMITC_DIALECT
+    IREE::VM::registerToCTranslation();
+#endif  // IREE_HAVE_EMITC_DIALECT
+
     return true;
   }();
   (void)init_once;
diff --git a/iree/compiler/Dialect/VM/Tools/VMOpTableGen.cpp b/iree/compiler/Dialect/VM/Tools/VMOpTableGen.cpp
index de17975..b3081c1 100644
--- a/iree/compiler/Dialect/VM/Tools/VMOpTableGen.cpp
+++ b/iree/compiler/Dialect/VM/Tools/VMOpTableGen.cpp
@@ -29,47 +29,33 @@
 using ::llvm::formatv;
 using ::llvm::Record;
 
-// Finds all serializable ops and emits a enum and template table for their
-// opcode and name.
-bool emitOpTableDefs(const llvm::RecordKeeper &recordKeeper, raw_ostream &os) {
-  llvm::emitSourceFileHeader("IREE VM Operation Tables", os);
-
-  std::vector<const Record *> opRecords(256);
+void emitOpTable(const llvm::RecordKeeper &recordKeeper, const Record &tableDef,
+                 raw_ostream &os) {
   std::vector<const Record *> opEncodings(256);
-  auto defs = recordKeeper.getAllDerivedDefinitions("VM_Op");
-  for (const auto *def : defs) {
-    if (def->isValueUnset("encoding")) continue;
-    auto encodingExprs = def->getValueAsListOfDefs("encoding");
-    for (auto encodingExpr : encodingExprs) {
-      if (encodingExpr->getType()->getAsString() == "VM_EncOpcode") {
-        auto *opcode = encodingExpr->getValueAsDef("opcode");
-        opRecords[opcode->getValueAsInt("value")] = def;
-        opEncodings[opcode->getValueAsInt("value")] = opcode;
-        break;
-      }
-    }
+  for (auto *opcodeDef : tableDef.getValueAsListOfDefs("enumerants")) {
+    opEncodings[opcodeDef->getValueAsInt("value")] = opcodeDef;
   }
 
   os << "typedef enum {\n";
   for (int i = 0; i < 256; ++i) {
-    auto *def = opRecords[i];
-    if (def) {
-      auto *opcode = opEncodings[i];
-      os << formatv("  IREE_VM_OP_{0} = {1}",
+    if (auto *opcode = opEncodings[i]) {
+      os << formatv("  IREE_VM_OP_{0}_{1} = {2}",
+                    tableDef.getValueAsString("opcodeEnumTag"),
                     opcode->getValueAsString("symbol"), format_hex(i, 4, true));
     } else {
-      os << formatv("  IREE_VM_OP_RSV_{0}", format_hex(i, 4, true));
+      os << formatv("  IREE_VM_OP_{0}_RSV_{1}",
+                    tableDef.getValueAsString("opcodeEnumTag"),
+                    format_hex(i, 4, true));
     }
     os << ",\n";
   }
-  os << "} iree_vm_op_t;\n";
+  os << "} " << tableDef.getValueAsString("opcodeEnumName") << ";\n";
   os << "\n";
 
-  os << "#define IREE_VM_OP_TABLE(OPC, RSV) \\\n";
+  os << formatv("#define IREE_VM_OP_{0}_TABLE(OPC, RSV) \\\n",
+                tableDef.getValueAsString("opcodeEnumTag"));
   for (int i = 0; i < 256; ++i) {
-    auto *def = opRecords[i];
-    if (def) {
-      auto *opcode = opEncodings[i];
+    if (auto *opcode = opEncodings[i]) {
       os << formatv("    OPC({0}, {1})", format_hex(i, 4, true),
                     opcode->getValueAsString("symbol"));
     } else {
@@ -80,6 +66,17 @@
     }
   }
   os << "\n\n";
+}
+
 +// Finds all opcode tables in VMBase.td and emits an enum and template table for
+// their opcode and name.
+bool emitOpTableDefs(const llvm::RecordKeeper &recordKeeper, raw_ostream &os) {
+  llvm::emitSourceFileHeader("IREE VM Operation Tables", os);
+
+  auto defs = recordKeeper.getAllDerivedDefinitions("VM_OPC_EnumAttr");
+  for (const auto *def : defs) {
+    emitOpTable(recordKeeper, *def, os);
+  }
 
   return false;
 }
diff --git a/iree/compiler/Translation/CMakeLists.txt b/iree/compiler/Translation/CMakeLists.txt
index dbe2324..e89e46f 100644
--- a/iree/compiler/Translation/CMakeLists.txt
+++ b/iree/compiler/Translation/CMakeLists.txt
@@ -12,8 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# bazel_to_cmake: DO NOT EDIT (EmitC is only buildable with CMake)
+
 iree_add_all_subdirs()
 
+if(IREE_ENABLE_EMITC)
+  set(IREE_VM_CONDITIONAL_TARGETS
+    iree::compiler::Dialect::VM::Target::C
+  )
+endif()
+
 iree_cc_library(
   NAME
     IREEVM
@@ -36,5 +44,6 @@
     iree::compiler::Dialect::VM::Conversion::StandardToVM
     iree::compiler::Dialect::VM::Target::Bytecode
     iree::compiler::Dialect::VM::Transforms
+    ${IREE_VM_CONDITIONAL_TARGETS}
   PUBLIC
 )
diff --git a/iree/compiler/Translation/IREEVM.cpp b/iree/compiler/Translation/IREEVM.cpp
index 8708592..14284eb 100644
--- a/iree/compiler/Translation/IREEVM.cpp
+++ b/iree/compiler/Translation/IREEVM.cpp
@@ -23,6 +23,10 @@
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Translation.h"
 
+#ifdef IREE_HAVE_EMITC_DIALECT
+#include "iree/compiler/Dialect/VM/Target/C/CModuleTarget.h"
+#endif  // IREE_HAVE_EMITC_DIALECT
+
 namespace mlir {
 namespace iree_compiler {
 
@@ -73,10 +77,8 @@
       });
 }
 
-LogicalResult translateFromMLIRToVMBytecodeModule(
-    ModuleOp moduleOp, IREE::HAL::TargetOptions executableOptions,
-    IREE::VM::BytecodeTargetOptions bytecodeOptions,
-    llvm::raw_ostream &output) {
+static LogicalResult translateFromMLIRToVM(
+    ModuleOp moduleOp, IREE::HAL::TargetOptions executableOptions) {
   // Convert from our source to a vm.module in canonical form.
   // After this completes we have a non-bytecode-specific vm.module that we
   // could lower to other forms (LLVM IR, C, etc).
@@ -86,9 +88,22 @@
   IREE::HAL::buildHALTransformPassPipeline(passManager, executableOptions);
   IREE::VM::buildVMTransformPassPipeline(passManager);
   passManager.addPass(mlir::iree_compiler::IREE::createDropCompilerHintsPass());
+
   if (failed(passManager.run(moduleOp))) {
     return moduleOp.emitError() << "conversion from source -> vm failed";
   }
+  return success();
+}
+
+LogicalResult translateFromMLIRToVMBytecodeModule(
+    ModuleOp moduleOp, IREE::HAL::TargetOptions executableOptions,
+    IREE::VM::BytecodeTargetOptions bytecodeOptions,
+    llvm::raw_ostream &output) {
+  auto result = translateFromMLIRToVM(moduleOp, executableOptions);
+
+  if (failed(result)) {
+    return result;
+  }
 
   // Serialize to bytecode.
   return translateModuleToBytecode(moduleOp, bytecodeOptions, output);
@@ -103,10 +118,37 @@
                                              bytecodeTargetOptions, output);
 }
 
+#ifdef IREE_HAVE_EMITC_DIALECT
+LogicalResult translateFromMLIRToVMCModule(
+    ModuleOp moduleOp, IREE::HAL::TargetOptions executableOptions,
+    llvm::raw_ostream &output) {
+  auto result = translateFromMLIRToVM(moduleOp, executableOptions);
+
+  if (failed(result)) {
+    return result;
+  }
+
+  // Serialize to c code.
+  return mlir::iree_compiler::IREE::VM::translateModuleToC(moduleOp, output);
+}
+
+static LogicalResult translateFromMLIRToVMCModuleWithFlags(
+    ModuleOp moduleOp, llvm::raw_ostream &output) {
+  mlir::registerPassManagerCLOptions();
+  auto TargetOptions = IREE::HAL::getTargetOptionsFromFlags();
+  return translateFromMLIRToVMCModule(moduleOp, TargetOptions, output);
+}
+#endif  // IREE_HAVE_EMITC_DIALECT
+
 void registerIREEVMTranslation() {
   TranslateFromMLIRRegistration toVMBytecodeModuleWithFlags(
       "iree-mlir-to-vm-bytecode-module",
       translateFromMLIRToVMBytecodeModuleWithFlags);
+
+#ifdef IREE_HAVE_EMITC_DIALECT
+  TranslateFromMLIRRegistration toVMCModuleWithFlags(
+      "iree-mlir-to-vm-c-module", translateFromMLIRToVMCModuleWithFlags);
+#endif  // IREE_HAVE_EMITC_DIALECT
 }
 
 }  // namespace iree_compiler
diff --git a/iree/compiler/Translation/IREEVM.h b/iree/compiler/Translation/IREEVM.h
index 9b95a31..ed9f36a 100644
--- a/iree/compiler/Translation/IREEVM.h
+++ b/iree/compiler/Translation/IREEVM.h
@@ -53,6 +53,16 @@
     ModuleOp moduleOp, IREE::HAL::TargetOptions executableOptions,
     IREE::VM::BytecodeTargetOptions bytecodeOptions, llvm::raw_ostream &output);
 
+#ifdef IREE_HAVE_EMITC_DIALECT
+// Translates an MLIR module containing a set of supported IREE input dialects
+// to an IREE VM C module.
+//
+// Exposed via the --iree-mlir-to-vm-c-module translation.
+LogicalResult translateFromMLIRToVMCModule(
+    ModuleOp moduleOp, IREE::HAL::TargetOptions executableOptions,
+    llvm::raw_ostream &output);
+#endif  // IREE_HAVE_EMITC_DIALECT
+
 // TODO(benvanik): versions with multiple targets, etc.
 
 void registerIREEVMTransformPassPipeline();
diff --git a/iree/hal/dylib/BUILD b/iree/hal/dylib/BUILD
index 25c08ea..fc3ccb9 100644
--- a/iree/hal/dylib/BUILD
+++ b/iree/hal/dylib/BUILD
@@ -60,7 +60,6 @@
     srcs = ["dylib_executable.cc"],
     hdrs = ["dylib_executable.h"],
     deps = [
-        ":memref_runtime",
         "//iree/base:dynamic_library",
         "//iree/base:file_io",
         "//iree/base:status",
@@ -89,10 +88,3 @@
         "//iree/hal:executable_format",
     ],
 )
-
-cc_library(
-    name = "memref_runtime",
-    hdrs = [
-        "memref_runtime.h",
-    ],
-)
diff --git a/iree/hal/dylib/CMakeLists.txt b/iree/hal/dylib/CMakeLists.txt
index 7644d92..d720435 100644
--- a/iree/hal/dylib/CMakeLists.txt
+++ b/iree/hal/dylib/CMakeLists.txt
@@ -65,7 +65,6 @@
   SRCS
     "dylib_executable.cc"
   DEPS
-    ::memref_runtime
     absl::inlined_vector
     absl::span
     flatbuffers
@@ -97,11 +96,3 @@
     iree::hal::executable_format
   PUBLIC
 )
-
-iree_cc_library(
-  NAME
-    memref_runtime
-  HDRS
-    "memref_runtime.h"
-  PUBLIC
-)
diff --git a/iree/hal/dylib/dylib_executable.cc b/iree/hal/dylib/dylib_executable.cc
index e06bb19..e58a003 100644
--- a/iree/hal/dylib/dylib_executable.cc
+++ b/iree/hal/dylib/dylib_executable.cc
@@ -17,7 +17,6 @@
 #include "flatbuffers/flatbuffers.h"
 #include "iree/base/file_io.h"
 #include "iree/base/tracing.h"
-#include "iree/hal/dylib/memref_runtime.h"
 #include "iree/schemas/dylib_executable_def_generated.h"
 
 namespace iree {
@@ -96,15 +95,9 @@
 
 struct DyLibDispatchState : public HostExecutable::DispatchState {
   DyLibDispatchState() = default;
-  ~DyLibDispatchState() override {
-    for (int i = 0; i < descriptors.size(); ++i) {
-      freeUnrankedDescriptor(descriptors[i]);
-    }
-  }
-
   void* entry_function = nullptr;
-  absl::InlinedVector<UnrankedMemRefType<uint32_t>*, 4> descriptors;
   absl::InlinedVector<void*, 4> args;
+  absl::InlinedVector<int64_t, 4> push_constant;
 };
 
 StatusOr<ref_ptr<HostExecutable::DispatchState>>
@@ -127,17 +120,14 @@
                                         MemoryAccessBitfield::kWrite,
                                         io_binding.offset, io_binding.length));
       auto data = memory.mutable_data();
-      auto descriptor = allocUnrankedDescriptor<uint32_t>(data);
-      dispatch_state->descriptors.push_back(descriptor);
-      dispatch_state->args.push_back(&descriptor->descriptor);
+
+      dispatch_state->args.push_back(data);
     }
   }
-
-  auto push_constants_descriptor = allocUnrankedDescriptor<uint32_t>(
-      const_cast<uint32_t*>(params.push_constants->values.data()),
-      {static_cast<int64_t>(params.push_constants->values.size())});
-  dispatch_state->descriptors.push_back(push_constants_descriptor);
-  dispatch_state->args.push_back(&push_constants_descriptor->descriptor);
+  // TODO(ataei): Consider moving this casting to codegen side ?!
+  for (int i = 0; i < params.push_constants->values.size(); ++i) {
+    dispatch_state->push_constant.push_back(params.push_constants->values[i]);
+  }
 
   return std::move(dispatch_state);
 }
@@ -147,8 +137,10 @@
   IREE_TRACE_SCOPE0("DyLibExecutable::DispatchTile");
   auto* dispatch_state = static_cast<DyLibDispatchState*>(state);
 
-  auto entry_function = (void (*)(void**))dispatch_state->entry_function;
-  entry_function(dispatch_state->args.data());
+  auto entry_function =
+      (void (*)(void**, int64_t*))dispatch_state->entry_function;
+  entry_function(dispatch_state->args.data(),
+                 dispatch_state->push_constant.data());
 
   return OkStatus();
 }
diff --git a/iree/hal/dylib/memref_runtime.h b/iree/hal/dylib/memref_runtime.h
deleted file mode 100644
index 50d3987..0000000
--- a/iree/hal/dylib/memref_runtime.h
+++ /dev/null
@@ -1,177 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-#ifndef IREE_HAL_DYLIB_MEMREF_RUNTIME_H_
-#define IREE_HAL_DYLIB_MEMREF_RUNTIME_H_
-
-#include <assert.h>
-
-#include <cstdint>
-#include <vector>
-
-namespace iree {
-namespace hal {
-namespace dylib {
-
-template <int N>
-void dropFront(int64_t arr[N], int64_t *res) {
-  for (unsigned i = 1; i < N; ++i) *(res + i - 1) = arr[i];
-}
-
-/// StridedMemRef descriptor type with static rank.
-template <typename T, int N>
-struct StridedMemRefType {
-  T *basePtr;
-  T *data;
-  int64_t offset;
-  int64_t sizes[N];
-  int64_t strides[N];
-  // This operator[] is extremely slow and only for sugaring purposes.
-  StridedMemRefType<T, N - 1> operator[](int64_t idx) {
-    StridedMemRefType<T, N - 1> res;
-    res.basePtr = basePtr;
-    res.data = data;
-    res.offset = offset + idx * strides[0];
-    dropFront<N>(sizes, res.sizes);
-    dropFront<N>(strides, res.strides);
-    return res;
-  }
-};
-
-/// StridedMemRef descriptor type specialized for rank 1.
-template <typename T>
-struct StridedMemRefType<T, 1> {
-  T *basePtr;
-  T *data;
-  int64_t offset;
-  int64_t sizes[1];
-  int64_t strides[1];
-  T &operator[](int64_t idx) { return *(data + offset + idx * strides[0]); }
-};
-
-/// StridedMemRef descriptor type specialized for rank 0.
-template <typename T>
-struct StridedMemRefType<T, 0> {
-  T *basePtr;
-  T *data;
-  int64_t offset;
-};
-
-// Unranked MemRef
-template <typename T>
-struct UnrankedMemRefType {
-  int64_t rank;
-  void *descriptor;
-};
-
-// Given a shape with sizes greater than 0 along all dimensions,
-// returns the distance, in number of elements, between a slice in a dimension
-// and the next slice in the same dimension.
-//   e.g. shape[3, 4, 5] -> strides[20, 5, 1]
-inline std::vector<int64_t> makeStrides(const std::vector<int64_t> &shape) {
-  std::vector<int64_t> tmp;
-  if (shape.empty()) return tmp;
-  tmp.reserve(shape.size());
-  int64_t running = 1;
-  for (auto rit = shape.rbegin(), reit = shape.rend(); rit != reit; ++rit) {
-    assert(*rit > 0 &&
-           "size must be greater than 0 along all dimensions of shape");
-    tmp.push_back(running);
-    running *= *rit;
-  }
-  return std::vector<int64_t>(tmp.rbegin(), tmp.rend());
-}
-
-// Mallocs a StridedMemRefDescriptor<T, N>* that matches the MLIR ABI.
-// This is an implementation detail that is kept in sync with MLIR codegen
-// conventions.
-template <typename T, int N>
-StridedMemRefType<T, N> *makeStridedMemRefDescriptor(
-    void *ptr, const std::vector<int64_t> &shape) {
-  StridedMemRefType<T, N> *descriptor = static_cast<StridedMemRefType<T, N> *>(
-      malloc(sizeof(StridedMemRefType<T, N>)));
-  descriptor->basePtr = static_cast<T *>(ptr);
-  descriptor->data = static_cast<T *>(ptr);
-  descriptor->offset = 0;
-  std::copy(shape.begin(), shape.end(), descriptor->sizes);
-  auto strides = makeStrides(shape);
-  std::copy(strides.begin(), strides.end(), descriptor->strides);
-  return descriptor;
-}
-
-// Mallocs a StridedMemRefDescriptor<T, 0>* (i.e. a pointer to scalar) that
-// matches the MLIR ABI. This is an implementation detail that is kept in sync
-// with MLIR codegen conventions.
-template <typename T>
-StridedMemRefType<T, 0> *makeStridedMemRefDescriptor(
-    void *ptr, const std::vector<int64_t> &shape) {
-  StridedMemRefType<T, 0> *descriptor = static_cast<StridedMemRefType<T, 0> *>(
-      malloc(sizeof(StridedMemRefType<T, 0>)));
-  descriptor->basePtr = static_cast<T *>(ptr);
-  descriptor->data = static_cast<T *>(ptr);
-  descriptor->offset = 0;
-  return descriptor;
-}
-
-// Mallocs an UnrankedMemRefType<T>* that contains a ranked
-// StridedMemRefDescriptor<T, Rank>* and matches the MLIR ABI. This is an
-// implementation detail that is kept in sync with MLIR codegen conventions.
-template <typename T>
-UnrankedMemRefType<T> *allocUnrankedDescriptor(
-    void *data, const std::vector<int64_t> &shape) {
-  UnrankedMemRefType<T> *res = static_cast<UnrankedMemRefType<T> *>(
-      malloc(sizeof(UnrankedMemRefType<T>)));
-  res->rank = shape.size();
-  if (res->rank == 0)
-    res->descriptor = makeStridedMemRefDescriptor<T>(data, shape);
-  else if (res->rank == 1)
-    res->descriptor = makeStridedMemRefDescriptor<T, 1>(data, shape);
-  else if (res->rank == 2)
-    res->descriptor = makeStridedMemRefDescriptor<T, 2>(data, shape);
-  else if (res->rank == 3)
-    res->descriptor = makeStridedMemRefDescriptor<T, 3>(data, shape);
-  else if (res->rank == 4)
-    res->descriptor = makeStridedMemRefDescriptor<T, 4>(data, shape);
-  else if (res->rank == 5)
-    res->descriptor = makeStridedMemRefDescriptor<T, 5>(data, shape);
-  else if (res->rank == 6)
-    res->descriptor = makeStridedMemRefDescriptor<T, 6>(data, shape);
-  else
-    assert(false && "Unsupported 6+D memref descriptor");
-  return res;
-}
-
-// Shape and strides aren't used in the generated code (yet).
-// TODO(ataei): Delete this version once we can pass shapes.
-template <typename T>
-UnrankedMemRefType<T> *allocUnrankedDescriptor(void *data) {
-  UnrankedMemRefType<T> *res = static_cast<UnrankedMemRefType<T> *>(
-      malloc(sizeof(UnrankedMemRefType<T>)));
-  res->descriptor = makeStridedMemRefDescriptor<T>(data, {});
-  return res;
-}
-
-// Frees an UnrankedMemRefType<T>*
-template <typename T>
-void freeUnrankedDescriptor(UnrankedMemRefType<T> *desc) {
-  free(desc->descriptor);
-  free(desc);
-}
-
-}  // namespace dylib
-}  // namespace hal
-}  // namespace iree
-
-#endif  // IREE_HAL_DYLIB_MEMREF_RUNTIME_H_
diff --git a/iree/hal/llvmjit/BUILD b/iree/hal/llvmjit/BUILD
index 088bb8b..3ebd609 100644
--- a/iree/hal/llvmjit/BUILD
+++ b/iree/hal/llvmjit/BUILD
@@ -64,7 +64,6 @@
     srcs = ["llvmjit_executable.cc"],
     hdrs = ["llvmjit_executable.h"],
     deps = [
-        ":memref_runtime",
         "//iree/base:status",
         "//iree/base:tracing",
         "//iree/hal:buffer",
@@ -95,10 +94,3 @@
         "//iree/hal:executable_format",
     ],
 )
-
-cc_library(
-    name = "memref_runtime",
-    hdrs = [
-        "memref_runtime.h",
-    ],
-)
diff --git a/iree/hal/llvmjit/CMakeLists.txt b/iree/hal/llvmjit/CMakeLists.txt
index 8418745..ca40941 100644
--- a/iree/hal/llvmjit/CMakeLists.txt
+++ b/iree/hal/llvmjit/CMakeLists.txt
@@ -68,7 +68,6 @@
   SRCS
     "llvmjit_executable.cc"
   DEPS
-    ::memref_runtime
     LLVMAsmParser
     LLVMCore
     LLVMOrcJIT
@@ -102,11 +101,3 @@
     iree::hal::executable_format
   PUBLIC
 )
-
-iree_cc_library(
-  NAME
-    memref_runtime
-  HDRS
-    "memref_runtime.h"
-  PUBLIC
-)
diff --git a/iree/hal/llvmjit/llvmjit_executable.cc b/iree/hal/llvmjit/llvmjit_executable.cc
index 1596b9e..7d26ccd 100644
--- a/iree/hal/llvmjit/llvmjit_executable.cc
+++ b/iree/hal/llvmjit/llvmjit_executable.cc
@@ -21,7 +21,6 @@
 #include "iree/base/tracing.h"
 #include "iree/hal/buffer.h"
 #include "iree/hal/executable.h"
-#include "iree/hal/llvmjit/memref_runtime.h"
 #include "iree/schemas/llvmir_executable_def_generated.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
@@ -82,13 +81,11 @@
       make_ref<LLVMJITExecutable>(spec, std::move(ll_jit), allow_aliasing_data);
 
   for (const auto func_name : *entry_points) {
-    auto func_symbol =
-        executable->ll_jit_->lookup("invoke_" + func_name->str());
+    auto func_symbol = executable->ll_jit_->lookup(func_name->str());
     if (!func_symbol) {
       return NotFoundErrorBuilder(IREE_LOC)
              << "Can't JIT compile function : " << func_name;
     }
-    // Map function to its invoke_ symbol.
     executable->symbols_.push_back(func_symbol.get());
   }
 
@@ -111,15 +108,10 @@
 
 struct LLVMJITDispatchState : public HostExecutable::DispatchState {
   LLVMJITDispatchState() = default;
-  ~LLVMJITDispatchState() override {
-    for (int i = 0; i < descriptors.size(); ++i) {
-      freeUnrankedDescriptor(descriptors[i]);
-    }
-  }
 
   llvm::JITEvaluatedSymbol symbol;
-  llvm::SmallVector<UnrankedMemRefType<uint32_t>*, 4> descriptors;
   llvm::SmallVector<void*, 4> args;
+  llvm::SmallVector<int64_t, 4> push_constant;
 };
 
 StatusOr<ref_ptr<HostExecutable::DispatchState>>
@@ -142,17 +134,13 @@
                                         MemoryAccessBitfield::kWrite,
                                         io_binding.offset, io_binding.length));
       auto data = memory.mutable_data();
-      auto descriptor = allocUnrankedDescriptor<uint32_t>(data);
-      dispatch_state->descriptors.push_back(descriptor);
-      dispatch_state->args.push_back(&descriptor->descriptor);
+      dispatch_state->args.push_back(data);
     }
   }
-
-  auto push_constants_descriptor = allocUnrankedDescriptor<uint32_t>(
-      const_cast<uint32_t*>(params.push_constants->values.data()),
-      {static_cast<int64_t>(params.push_constants->values.size())});
-  dispatch_state->descriptors.push_back(push_constants_descriptor);
-  dispatch_state->args.push_back(&push_constants_descriptor->descriptor);
+  // TODO(ataei): Consider moving this casting to codegen side ?!
+  for (int i = 0; i < params.push_constants->values.size(); ++i) {
+    dispatch_state->push_constant.push_back(params.push_constants->values[i]);
+  }
 
   return std::move(dispatch_state);
 }
@@ -162,8 +150,9 @@
   IREE_TRACE_SCOPE0("LLVMJITExecutable::DispatchTile");
   auto* dispatch_state = static_cast<LLVMJITDispatchState*>(state);
 
-  auto func_ptr = (void (*)(void**))dispatch_state->symbol.getAddress();
-  func_ptr(dispatch_state->args.data());
+  auto func_ptr =
+      (void (*)(void**, int64_t*))dispatch_state->symbol.getAddress();
+  func_ptr(dispatch_state->args.data(), dispatch_state->push_constant.data());
 
   return OkStatus();
 }
diff --git a/iree/hal/llvmjit/memref_runtime.h b/iree/hal/llvmjit/memref_runtime.h
deleted file mode 100644
index 6b94410..0000000
--- a/iree/hal/llvmjit/memref_runtime.h
+++ /dev/null
@@ -1,177 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-#ifndef IREE_HAL_LLVMJIT_LLVMJIT_MEMREF_RUNTIME_H_
-#define IREE_HAL_LLVMJIT_LLVMJIT_MEMREF_RUNTIME_H_
-
-#include <assert.h>
-
-#include <cstdint>
-#include <vector>
-
-namespace iree {
-namespace hal {
-namespace llvmjit {
-
-template <int N>
-void dropFront(int64_t arr[N], int64_t *res) {
-  for (unsigned i = 1; i < N; ++i) *(res + i - 1) = arr[i];
-}
-
-/// StridedMemRef descriptor type with static rank.
-template <typename T, int N>
-struct StridedMemRefType {
-  T *basePtr;
-  T *data;
-  int64_t offset;
-  int64_t sizes[N];
-  int64_t strides[N];
-  // This operator[] is extremely slow and only for sugaring purposes.
-  StridedMemRefType<T, N - 1> operator[](int64_t idx) {
-    StridedMemRefType<T, N - 1> res;
-    res.basePtr = basePtr;
-    res.data = data;
-    res.offset = offset + idx * strides[0];
-    dropFront<N>(sizes, res.sizes);
-    dropFront<N>(strides, res.strides);
-    return res;
-  }
-};
-
-/// StridedMemRef descriptor type specialized for rank 1.
-template <typename T>
-struct StridedMemRefType<T, 1> {
-  T *basePtr;
-  T *data;
-  int64_t offset;
-  int64_t sizes[1];
-  int64_t strides[1];
-  T &operator[](int64_t idx) { return *(data + offset + idx * strides[0]); }
-};
-
-/// StridedMemRef descriptor type specialized for rank 0.
-template <typename T>
-struct StridedMemRefType<T, 0> {
-  T *basePtr;
-  T *data;
-  int64_t offset;
-};
-
-// Unranked MemRef
-template <typename T>
-struct UnrankedMemRefType {
-  int64_t rank;
-  void *descriptor;
-};
-
-// Given a shape with sizes greater than 0 along all dimensions,
-// returns the distance, in number of elements, between a slice in a dimension
-// and the next slice in the same dimension.
-//   e.g. shape[3, 4, 5] -> strides[20, 5, 1]
-inline std::vector<int64_t> makeStrides(const std::vector<int64_t> &shape) {
-  std::vector<int64_t> tmp;
-  if (shape.empty()) return tmp;
-  tmp.reserve(shape.size());
-  int64_t running = 1;
-  for (auto rit = shape.rbegin(), reit = shape.rend(); rit != reit; ++rit) {
-    assert(*rit > 0 &&
-           "size must be greater than 0 along all dimensions of shape");
-    tmp.push_back(running);
-    running *= *rit;
-  }
-  return std::vector<int64_t>(tmp.rbegin(), tmp.rend());
-}
-
-// Mallocs a StridedMemRefDescriptor<T, N>* that matches the MLIR ABI.
-// This is an implementation detail that is kept in sync with MLIR codegen
-// conventions.
-template <typename T, int N>
-StridedMemRefType<T, N> *makeStridedMemRefDescriptor(
-    void *ptr, const std::vector<int64_t> &shape) {
-  StridedMemRefType<T, N> *descriptor = static_cast<StridedMemRefType<T, N> *>(
-      malloc(sizeof(StridedMemRefType<T, N>)));
-  descriptor->basePtr = static_cast<T *>(ptr);
-  descriptor->data = static_cast<T *>(ptr);
-  descriptor->offset = 0;
-  std::copy(shape.begin(), shape.end(), descriptor->sizes);
-  auto strides = makeStrides(shape);
-  std::copy(strides.begin(), strides.end(), descriptor->strides);
-  return descriptor;
-}
-
-// Mallocs a StridedMemRefDescriptor<T, 0>* (i.e. a pointer to scalar) that
-// matches the MLIR ABI. This is an implementation detail that is kept in sync
-// with MLIR codegen conventions.
-template <typename T>
-StridedMemRefType<T, 0> *makeStridedMemRefDescriptor(
-    void *ptr, const std::vector<int64_t> &shape) {
-  StridedMemRefType<T, 0> *descriptor = static_cast<StridedMemRefType<T, 0> *>(
-      malloc(sizeof(StridedMemRefType<T, 0>)));
-  descriptor->basePtr = static_cast<T *>(ptr);
-  descriptor->data = static_cast<T *>(ptr);
-  descriptor->offset = 0;
-  return descriptor;
-}
-
-// Mallocs an UnrankedMemRefType<T>* that contains a ranked
-// StridedMemRefDescriptor<T, Rank>* and matches the MLIR ABI. This is an
-// implementation detail that is kept in sync with MLIR codegen conventions.
-template <typename T>
-UnrankedMemRefType<T> *allocUnrankedDescriptor(
-    void *data, const std::vector<int64_t> &shape) {
-  UnrankedMemRefType<T> *res = static_cast<UnrankedMemRefType<T> *>(
-      malloc(sizeof(UnrankedMemRefType<T>)));
-  res->rank = shape.size();
-  if (res->rank == 0)
-    res->descriptor = makeStridedMemRefDescriptor<T>(data, shape);
-  else if (res->rank == 1)
-    res->descriptor = makeStridedMemRefDescriptor<T, 1>(data, shape);
-  else if (res->rank == 2)
-    res->descriptor = makeStridedMemRefDescriptor<T, 2>(data, shape);
-  else if (res->rank == 3)
-    res->descriptor = makeStridedMemRefDescriptor<T, 3>(data, shape);
-  else if (res->rank == 4)
-    res->descriptor = makeStridedMemRefDescriptor<T, 4>(data, shape);
-  else if (res->rank == 5)
-    res->descriptor = makeStridedMemRefDescriptor<T, 5>(data, shape);
-  else if (res->rank == 6)
-    res->descriptor = makeStridedMemRefDescriptor<T, 6>(data, shape);
-  else
-    assert(false && "Unsupported 6+D memref descriptor");
-  return res;
-}
-
-// Shape and strides aren't used in the generated code (yet).
-// TODO(ataei): Delete this version once we can pass shapes.
-template <typename T>
-UnrankedMemRefType<T> *allocUnrankedDescriptor(void *data) {
-  UnrankedMemRefType<T> *res = static_cast<UnrankedMemRefType<T> *>(
-      malloc(sizeof(UnrankedMemRefType<T>)));
-  res->descriptor = makeStridedMemRefDescriptor<T>(data, {});
-  return res;
-}
-
-// Frees an UnrankedMemRefType<T>*
-template <typename T>
-void freeUnrankedDescriptor(UnrankedMemRefType<T> *desc) {
-  free(desc->descriptor);
-  free(desc);
-}
-
-}  // namespace llvmjit
-}  // namespace hal
-}  // namespace iree
-
-#endif  // IREE_HAL_LLVMJIT_LLVMJIT_MEMREF_RUNTIME_H_
diff --git a/iree/modules/hal/hal_module.cc b/iree/modules/hal/hal_module.cc
index 2b1d6db..e51938f 100644
--- a/iree/modules/hal/hal_module.cc
+++ b/iree/modules/hal/hal_module.cc
@@ -457,6 +457,30 @@
     return BufferViewDimsN<4>(std::move(buffer_view));
   }
 
+  Status BufferViewTrace(
+      absl::Span<const vm::ref<iree_hal_buffer_view_t>> buffer_views) {
+    // TODO(hanchung): Have better information for each dump, eg, having StrAttr
+    // for each trace event so we can map the dump to dispatch functions easier.
+    fprintf(stderr, "=== DEBUG DUMP ===\n");
+    for (auto& view : buffer_views) {
+      std::string result_str(4096, '\0');
+      iree_status_t status;
+      do {
+        iree_host_size_t actual_length = 0;
+        status = iree_hal_buffer_view_format(
+            view.get(), /*max_element_count=*/1024, result_str.size() + 1,
+            &result_str[0], &actual_length);
+        result_str.resize(actual_length);
+      } while (iree_status_is_out_of_range(status));
+      if (!iree_status_is_ok(status)) {
+        return FromApiStatus(status, IREE_LOC);
+      }
+      fprintf(stderr, "%s\n", result_str.c_str());
+    }
+    fprintf(stderr, "\n");
+    return OkStatus();
+  }
+
   //===--------------------------------------------------------------------===//
   // iree::hal::CommandBuffer
   //===--------------------------------------------------------------------===//
@@ -876,6 +900,8 @@
                            &HALModuleState::BufferViewDims3),
     vm::MakeNativeFunction("buffer_view.dims.4",
                            &HALModuleState::BufferViewDims4),
+    vm::MakeNativeFunction("buffer_view.trace",
+                           &HALModuleState::BufferViewTrace),
 
     vm::MakeNativeFunction("command_buffer.create",
                            &HALModuleState::CommandBufferCreate),
diff --git a/iree/samples/simple_embedding/simple_embedding_test.cc b/iree/samples/simple_embedding/simple_embedding_test.cc
index c99b6d7..489ac33 100644
--- a/iree/samples/simple_embedding/simple_embedding_test.cc
+++ b/iree/samples/simple_embedding/simple_embedding_test.cc
@@ -183,8 +183,8 @@
   ASSERT_API_OK(iree_hal_buffer_unmap(ret_buffer, &mapped_memory));
   LOG(INFO) << "Results match!";
 
-  iree_vm_list_deinitialize(inputs.get());
-  iree_vm_list_deinitialize(outputs.get());
+  inputs.reset();
+  outputs.reset();
   iree_hal_device_release(device);
   iree_vm_context_release(context);
   iree_vm_instance_release(instance);
diff --git a/iree/samples/vulkan/BUILD b/iree/samples/vulkan/BUILD
index b684de7..8278b40 100644
--- a/iree/samples/vulkan/BUILD
+++ b/iree/samples/vulkan/BUILD
@@ -57,6 +57,7 @@
         "//iree/vm:bytecode_module",
         "//iree/vm:ref_cc",
         "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/types:span",
         "@dear_imgui",
         "@dear_imgui//:imgui_sdl_vulkan",
         "@iree_vulkan_headers//:vulkan_headers_no_prototypes",
diff --git a/iree/schemas/bytecode_module_def.fbs b/iree/schemas/bytecode_module_def.fbs
index 6eff279..f971093 100644
--- a/iree/schemas/bytecode_module_def.fbs
+++ b/iree/schemas/bytecode_module_def.fbs
@@ -43,7 +43,7 @@
   // Function level reflection attributes.
   // These are typically used to communicate additional ABI metadata needed
   // for dynamic invocation and host language mapping.
-  // See: docs/function_abi.md
+  // See: docs/design_docs/function_abi.md
   reflection_attrs:[ReflectionAttrDef];
 }
 
diff --git a/iree/test/e2e/regression/dynamic_torch_index_select_high_rank.mlir b/iree/test/e2e/regression/dynamic_torch_index_select_high_rank.mlir
new file mode 100644
index 0000000..ce42e31
--- /dev/null
+++ b/iree/test/e2e/regression/dynamic_torch_index_select_high_rank.mlir
@@ -0,0 +1,58 @@
+// RUN: iree-run-mlir %s -iree-hal-target-backends=llvm-ir -input-value="2x2xi32=[6, 7] [8, 9]" -input-value="2x2x2x2xi32=[[[0, 1] [1, 0]] [[0, 0] [1, 1]]] [[[1, 1] [0, 0]] [[0, 1] [1, 0]]]" | IreeFileCheck %s
+
+// CHECK-LABEL: EXEC @torch_index_select1
+func @torch_index_select1(%arg0: tensor<?x?xi32>, %arg1: tensor<?x?x?x?xi32>) -> tensor<?x?x?x?xi32> attributes {iree.module.export} {
+  %0 = "mhlo.torch_index_select"(%arg0, %arg1) {batch_dims = 1 : i64, dim = 1 : i64} : (tensor<?x?xi32>, tensor<?x?x?x?xi32>) -> tensor<?x?x?x?xi32>
+  return %0 : tensor<?x?x?x?xi32>
+}
+
+// CHECK: 2x2x2x2xi32=[
+// CHECK-SAME:   [
+// CHECK-SAME:     [6 7][7 6]
+// CHECK-SAME:   ][
+// CHECK-SAME:     [6 6][7 7]
+// CHECK-SAME:   ]
+// CHECK-SAME: ][
+// CHECK-SAME:   [
+// CHECK-SAME:     [9 9][8 8]
+// CHECK-SAME:   ][
+// CHECK-SAME:     [8 9][9 8]
+// CHECK-SAME:   ]
+// CHECK-SAME: ]
+
+// CHECK-LABEL: EXEC @torch_index_select2
+func @torch_index_select2(%arg0: tensor<?x?xi32>, %arg1: tensor<?x?x?x?xi32>) -> tensor<?x?x?x?x?xi32> attributes {iree.module.export} {
+  %0 = "mhlo.torch_index_select"(%arg0, %arg1) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<?x?xi32>, tensor<?x?x?x?xi32>) -> tensor<?x?x?x?x?xi32>
+  return %0 : tensor<?x?x?x?x?xi32>
+}
+
+// CHECK: 2x2x2x2x2xi32=[
+// CHECK-SAME:   [
+// CHECK-SAME:     [
+// CHECK-SAME:       [6 7][8 9]
+// CHECK-SAME:     ][
+// CHECK-SAME:       [8 9][6 7]
+// CHECK-SAME:     ]
+// CHECK-SAME:   ][
+// CHECK-SAME:     [
+// CHECK-SAME:       [6 7][6 7]
+// CHECK-SAME:     ][
+// CHECK-SAME:       [8 9][8 9]
+// CHECK-SAME:     ]
+// CHECK-SAME:   ]
+// CHECK-SAME: ][
+// CHECK-SAME:   [
+// CHECK-SAME:     [
+// CHECK-SAME:       [8 9][8 9]
+// CHECK-SAME:     ][
+// CHECK-SAME:       [6 7][6 7]
+// CHECK-SAME:     ]
+// CHECK-SAME:   ][
+// CHECK-SAME:     [
+// CHECK-SAME:       [6 7][8 9]
+// CHECK-SAME:     ][
+// CHECK-SAME:       [8 9][6 7]
+// CHECK-SAME:     ]
+// CHECK-SAME:   ]
+// CHECK-SAME: ]
+
diff --git a/iree/test/e2e/regression/dynamic_torch_index_select_negative.mlir b/iree/test/e2e/regression/dynamic_torch_index_select_negative.mlir
new file mode 100644
index 0000000..0755b5c
--- /dev/null
+++ b/iree/test/e2e/regression/dynamic_torch_index_select_negative.mlir
@@ -0,0 +1,14 @@
+// RUN: iree-run-mlir %s -iree-hal-target-backends=llvm-ir -input-value="2x2x2xi32=[[100, 101] [110, 111]] [[200, 201] [210, 211]]" -input-value="2x2x2xi32=[[0, 1] [1, 0]] [[0, 0] [1, 1]]" | IreeFileCheck %s
+
+// CHECK-LABEL: EXEC @torch_index_select1
+func @torch_index_select1(%arg0: tensor<?x?x?xi32>, %arg1: tensor<?x?x?xi32>) -> tensor<?x?x?xi32> attributes {iree.module.export} {
+  %0 = "mhlo.torch_index_select"(%arg0, %arg1) {batch_dims = -1 : i64, dim = -1 : i64} : (tensor<?x?x?xi32>, tensor<?x?x?xi32>) -> tensor<?x?x?xi32>
+  return %0 : tensor<?x?x?xi32>
+}
+
+// CHECK: 2x2x2xi32=[
+// CHECK-SAME:   [100 101][111 110]
+// CHECK-SAME: ][
+// CHECK-SAME:   [200 200][211 211]
+// CHECK-SAME: ]
+
diff --git a/iree/test/e2e/regression/dynamic_torch_index_select_scalar.mlir b/iree/test/e2e/regression/dynamic_torch_index_select_scalar.mlir
new file mode 100644
index 0000000..8ccb4fa
--- /dev/null
+++ b/iree/test/e2e/regression/dynamic_torch_index_select_scalar.mlir
@@ -0,0 +1,17 @@
+// RUN: iree-run-mlir %s -iree-hal-target-backends=llvm-ir -input-value="5x1x5xi32=[[1,2,3,4,5]] [[6,7,8,9,10]] [[11,12,13,14,15]] [[16,17,18,19,20]] [[21,22,23,24,25]]" -input-value="i32=0" | IreeFileCheck %s
+
+// CHECK-LABEL: EXEC @torch_index_select1
+func @torch_index_select1(%arg0: tensor<?x?x?xi32>, %arg1: tensor<i32>) -> tensor<?x?xi32> attributes {iree.module.export} {
+  %0 = "mhlo.torch_index_select"(%arg0, %arg1) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<?x?x?xi32>, tensor<i32>) -> tensor<?x?xi32>
+  return %0 : tensor<?x?xi32>
+}
+
+// CHECK: 1x5xi32=[1 2 3 4 5]
+
+// CHECK-LABEL: EXEC @torch_index_select2
+func @torch_index_select2(%arg0: tensor<?x?x?xi32>, %arg1: tensor<i32>) -> tensor<?x?xi32> attributes {iree.module.export} {
+  %0 = "mhlo.torch_index_select"(%arg0, %arg1) {batch_dims = 0 : i64, dim = 1 : i64} : (tensor<?x?x?xi32>, tensor<i32>) -> tensor<?x?xi32>
+  return %0 : tensor<?x?xi32>
+}
+
+// CHECK: 5x5xi32=[1 2 3 4 5][6 7 8 9 10][11 12 13 14 15][16 17 18 19 20][21 22 23 24 25]
diff --git a/iree/test/e2e/regression/dynamic_torch_index_select_vector.mlir b/iree/test/e2e/regression/dynamic_torch_index_select_vector.mlir
new file mode 100644
index 0000000..d22ccc7
--- /dev/null
+++ b/iree/test/e2e/regression/dynamic_torch_index_select_vector.mlir
@@ -0,0 +1,28 @@
+// RUN: iree-run-mlir %s -iree-hal-target-backends=llvm-ir -input-value="3x2x2xi32=[[1, 2] [3, 4]] [[5, 6] [7, 8]] [[9, 10] [11, 12]]" -input-value="2xi32=[0, 1]" | IreeFileCheck %s
+
+// CHECK-LABEL: EXEC @torch_index_select1
+func @torch_index_select1(%arg0: tensor<?x?x?xi32>, %arg1: tensor<?xi32>) -> tensor<?x?x?xi32> attributes {iree.module.export} {
+  %0 = "mhlo.torch_index_select"(%arg0, %arg1) {batch_dims = 0 : i64, dim = 1 : i64} : (tensor<?x?x?xi32>, tensor<?xi32>) -> tensor<?x?x?xi32>
+  return %0 : tensor<?x?x?xi32>
+}
+
+// CHECK: 3x2x2xi32=[
+// CHECK-SAME:   [1 2][3 4]
+// CHECK-SAME: ][
+// CHECK-SAME:   [5 6][7 8]
+// CHECK-SAME: ][
+// CHECK-SAME:   [9 10][11 12]
+// CHECK-SAME: ]
+
+// CHECK-LABEL: EXEC @torch_index_select2
+func @torch_index_select2(%arg0: tensor<?x?x?xi32>, %arg1: tensor<?xi32>) -> tensor<?x?x?xi32> attributes {iree.module.export} {
+  %0 = "mhlo.torch_index_select"(%arg0, %arg1) {batch_dims = 0 : i64, dim = 0 : i64} : (tensor<?x?x?xi32>, tensor<?xi32>) -> tensor<?x?x?xi32>
+  return %0 : tensor<?x?x?xi32>
+}
+
+// CHECK: 2x2x2xi32=[
+// CHECK-SAME:   [1 2][3 4]
+// CHECK-SAME: ][
+// CHECK-SAME:   [5 6][7 8]
+// CHECK-SAME: ]
+
diff --git a/iree/test/e2e/vulkan_specific/BUILD b/iree/test/e2e/vulkan_specific/BUILD
index 0565a86..4d20c33 100644
--- a/iree/test/e2e/vulkan_specific/BUILD
+++ b/iree/test/e2e/vulkan_specific/BUILD
@@ -29,38 +29,3 @@
     driver = "vulkan",
     target_backend = "vulkan-spirv",
 )
-
-# TODO(#2345): Merge two tests into one single file.
-iree_check_single_backend_test_suite(
-    name = "check_vulkan-spirv-split-pad-conv_vulkan",
-    srcs = [
-        "convolution1.mlir",
-        "convolution2.mlir",
-    ],
-    driver = "vulkan",
-    target_backend = "vulkan-spirv",
-)
-
-# TODO(#2345): Merge two tests into one single file.
-iree_check_single_backend_test_suite(
-    name = "check_vulkan-spirv-nosplit-pad-conv_vulkan",
-    srcs = [
-        "convolution1.mlir",
-        "convolution2.mlir",
-    ],
-    compiler_flags = ["-iree-extract-pad-from-conv=false"],
-    driver = "vulkan",
-    target_backend = "vulkan-spirv",
-)
-
-# TODO(#2345): Merge two tests into one single file.
-iree_check_single_backend_test_suite(
-    name = "check_vulkan-spirv-conv-nocontrol_vulkan",
-    srcs = [
-        "convolution1.mlir",
-        "convolution2.mlir",
-    ],
-    compiler_flags = ["-iree-codegen-use-legacy-conv-lowering=false"],
-    driver = "vulkan",
-    target_backend = "vulkan-spirv",
-)
diff --git a/iree/test/e2e/vulkan_specific/CMakeLists.txt b/iree/test/e2e/vulkan_specific/CMakeLists.txt
index cca6c58..32ee021 100644
--- a/iree/test/e2e/vulkan_specific/CMakeLists.txt
+++ b/iree/test/e2e/vulkan_specific/CMakeLists.txt
@@ -25,43 +25,3 @@
   DRIVER
     vulkan
 )
-
-iree_check_single_backend_test_suite(
-  NAME
-    check_vulkan-spirv-split-pad-conv_vulkan
-  SRCS
-    "convolution1.mlir"
-    "convolution2.mlir"
-  TARGET_BACKEND
-    vulkan-spirv
-  DRIVER
-    vulkan
-)
-
-iree_check_single_backend_test_suite(
-  NAME
-    check_vulkan-spirv-nosplit-pad-conv_vulkan
-  SRCS
-    "convolution1.mlir"
-    "convolution2.mlir"
-  TARGET_BACKEND
-    vulkan-spirv
-  DRIVER
-    vulkan
-  COMPILER_FLAGS
-    "-iree-extract-pad-from-conv=false"
-)
-
-iree_check_single_backend_test_suite(
-  NAME
-    check_vulkan-spirv-conv-nocontrol_vulkan
-  SRCS
-    "convolution1.mlir"
-    "convolution2.mlir"
-  TARGET_BACKEND
-    vulkan-spirv
-  DRIVER
-    vulkan
-  COMPILER_FLAGS
-    "-iree-codegen-use-legacy-conv-lowering=false"
-)
diff --git a/iree/test/e2e/vulkan_specific/convolution1.mlir b/iree/test/e2e/vulkan_specific/convolution1.mlir
deleted file mode 100644
index d0fc606..0000000
--- a/iree/test/e2e/vulkan_specific/convolution1.mlir
+++ /dev/null
@@ -1,66 +0,0 @@
-func @conv2d_nopadding() attributes { iree.module.export } {
-  %inputs = iree.unfoldable_constant dense<[[
-      [[ 1.0,  2.0], [ 3.0,  4.0], [ 5.0,  6.0], [ 7.0,  8.0], [ 9.0, 10.0]],
-      [[11.0, 12.0], [13.0, 14.0], [15.0, 16.0], [17.0, 18.0], [19.0, 20.0]],
-      [[21.0, 22.0], [23.0, 24.0], [25.0, 26.0], [27.0, 28.0], [29.0, 30.0]],
-      [[31.0, 32.0], [33.0, 34.0], [35.0, 36.0], [37.0, 38.0], [39.0, 40.0]]]]> : tensor<1x4x5x2xf32>
-  %weights = iree.unfoldable_constant dense<[
-      [[[ 1.0], [ 2.0]], [[ 3.0], [ 4.0]]],
-      [[[ 5.0], [ 6.0]], [[ 7.0], [ 8.0]]],
-      [[[ 9.0], [10.0]], [[11.0], [12.0]]]]> : tensor<3x2x2x1xf32>
-  %res = "mhlo.convolution"(%inputs, %weights) {
-        batch_group_count = 1 : i64,
-        dimension_numbers = {
-          input_batch_dimension = 0 : i64,
-          input_feature_dimension = 3 : i64,
-          input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
-          kernel_input_feature_dimension = 2 : i64,
-          kernel_output_feature_dimension = 3 : i64,
-          kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
-          output_batch_dimension = 0 : i64,
-          output_feature_dimension = 3 : i64,
-          output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>},
-        feature_group_count = 1 : i64,
-        rhs_dilation = dense<1> : tensor<2xi64>,
-        window_strides = dense<1> : tensor<2xi64>} : (tensor<1x4x5x2xf32>, tensor<3x2x2x1xf32>) -> tensor<1x2x3x1xf32>
-  check.expect_almost_eq_const(%res, dense<[[
-      [[1310.0],[1466.0],[1622.0]],
-      [[2090.0],[2246.0],[2402.0]]
-  ]]> : tensor<1x2x3x1xf32>) : tensor<1x2x3x1xf32>
-  return
-}
-
-func @conv2d_1452x3221_same() attributes { iree.module.export } {
-  %inputs = iree.unfoldable_constant dense<[[
-      [[ 1.0,  2.0], [ 3.0,  4.0], [ 5.0,  6.0], [ 7.0,  8.0], [ 9.0, 10.0]],
-      [[11.0, 12.0], [13.0, 14.0], [15.0, 16.0], [17.0, 18.0], [19.0, 20.0]],
-      [[21.0, 22.0], [23.0, 24.0], [25.0, 26.0], [27.0, 28.0], [29.0, 30.0]],
-      [[31.0, 32.0], [33.0, 34.0], [35.0, 36.0], [37.0, 38.0], [39.0, 40.0]]]]> : tensor<1x4x5x2xf32>
-  %weights = iree.unfoldable_constant dense<[
-      [[[ 1.0], [ 2.0]], [[ 3.0], [ 4.0]]],
-      [[[ 5.0], [ 6.0]], [[ 7.0], [ 8.0]]],
-      [[[ 9.0], [10.0]], [[11.0], [12.0]]]]> : tensor<3x2x2x1xf32>
-  %res = "mhlo.convolution"(%inputs, %weights) {
-       batch_group_count = 1 : i64,
-       dimension_numbers = {
-         input_batch_dimension = 0 : i64,
-         input_feature_dimension = 3 : i64,
-         input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
-         kernel_input_feature_dimension = 2 : i64,
-         kernel_output_feature_dimension = 3 : i64,
-         kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
-         output_batch_dimension = 0 : i64,
-         output_feature_dimension = 3 : i64,
-         output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>},
-       feature_group_count = 1 : i64,
-       padding = dense<[[1, 1], [0, 1]]> : tensor<2x2xi64>,
-       rhs_dilation = dense<1> : tensor<2xi64>,
-       window_strides = dense<1> : tensor<2xi64>} :
-       (tensor<1x4x5x2xf32>, tensor<3x2x2x1xf32>) -> tensor<1x4x5x1xf32>
-  check.expect_almost_eq_const(%res,  dense<[[
-    [[ 600.0], [ 736.0], [ 872.0], [1008.0], [ 476.0]],
-    [[1310.0], [1466.0], [1622.0], [1778.0], [ 805.0]],
-    [[2090.0], [2246.0], [2402.0], [2558.0], [1135.0]],
-    [[1080.0], [1152.0], [1224.0], [1296.0], [ 524.0]]]]> : tensor<1x4x5x1xf32>) : tensor<1x4x5x1xf32>
-  return
-}
diff --git a/iree/test/e2e/vulkan_specific/convolution2.mlir b/iree/test/e2e/vulkan_specific/convolution2.mlir
deleted file mode 100644
index ce88d5d..0000000
--- a/iree/test/e2e/vulkan_specific/convolution2.mlir
+++ /dev/null
@@ -1,140 +0,0 @@
-func @conv2d_2451x2311_same() attributes { iree.module.export } {
-  %inputs = iree.unfoldable_constant dense<[
-      [[[ 1.0], [ 2.0], [ 3.0], [ 4.0], [ 5.0]],
-       [[ 6.0], [ 7.0], [ 8.0], [ 9.0], [10.0]],
-       [[11.0], [12.0], [13.0], [14.0], [15.0]],
-       [[16.0], [17.0], [18.0], [19.0], [20.0]]],
-      [[[21.0], [22.0], [23.0], [24.0], [25.0]],
-       [[26.0], [27.0], [28.0], [29.0], [30.0]],
-       [[31.0], [32.0], [33.0], [34.0], [35.0]],
-       [[36.0], [37.0], [38.0], [39.0], [40.0]]]]> : tensor <2x4x5x1xf32>
-  %weights = iree.unfoldable_constant dense<[
-      [[[1.0]], [[2.0]], [[3.0]]],
-      [[[4.0]], [[5.0]], [[6.0]]]]> : tensor <2x3x1x1xf32>
-  %res = "mhlo.convolution"(%inputs, %weights) {
-       batch_group_count = 1 : i64,
-       dimension_numbers = {
-         input_batch_dimension = 0 : i64,
-         input_feature_dimension = 3 : i64,
-         input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
-         kernel_input_feature_dimension = 2 : i64,
-         kernel_output_feature_dimension = 3 : i64,
-         kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
-         output_batch_dimension = 0 : i64,
-         output_feature_dimension = 3 : i64,
-         output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>},
-       feature_group_count = 1 : i64,
-       padding = dense<[[0, 1], [1, 1]]> : tensor<2x2xi64>,
-       rhs_dilation = dense<1> : tensor<2xi64>,
-       window_strides = dense<1> : tensor<2xi64>} :
-       (tensor<2x4x5x1xf32>, tensor<2x3x1x1xf32>) -> tensor<2x4x5x1xf32>
-  check.expect_almost_eq_const(%res, dense<[
-    [[[ 80.0], [121.0], [142.0], [163.0], [100.0]],
-     [[160.0], [226.0], [247.0], [268.0], [160.0]],
-     [[240.0], [331.0], [352.0], [373.0], [220.0]],
-     [[ 83.0], [104.0], [110.0], [116.0], [ 59.0]]],
-    [[[400.0], [541.0], [562.0], [583.0], [340.0]],
-     [[480.0], [646.0], [667.0], [688.0], [400.0]],
-     [[560.0], [751.0], [772.0], [793.0], [460.0]],
-     [[183.0], [224.0], [230.0], [236.0], [119.0]]]]> : tensor<2x4x5x1xf32>) : tensor<2x4x5x1xf32>
-  return
-}
-
-func @conv2d_no_padding() attributes { iree.module.export } {
-  %inputs = iree.unfoldable_constant dense<[
-       [[[  1.0,   2.0,   3.0],
-         [  4.0,   5.0,   6.0],
-         [  7.0,   8.0,   9.0],
-         [ 10.0,  11.0,  12.0],
-         [ 13.0,  14.0,  15.0]],
-        [[ 16.0,  17.0,  18.0],
-         [ 19.0,  20.0,  21.0],
-         [ 22.0,  23.0,  24.0],
-         [ 25.0,  26.0,  27.0],
-         [ 28.0,  29.0,  30.0]],
-        [[ 31.0,  32.0,  33.0],
-         [ 34.0,  35.0,  36.0],
-         [ 37.0,  38.0,  39.0],
-         [ 40.0,  41.0,  42.0],
-         [ 43.0,  44.0,  45.0]],
-        [[ 46.0,  47.0,  48.0],
-         [ 49.0,  50.0,  51.0],
-         [ 52.0,  53.0,  54.0],
-         [ 55.0,  56.0,  57.0],
-         [ 58.0,  59.0,  60.0]]],
-       [[[ 61.0,  62.0,  63.0],
-         [ 64.0,  65.0,  66.0],
-         [ 67.0,  68.0,  69.0],
-         [ 70.0,  71.0,  72.0],
-         [ 73.0,  74.0,  75.0]],
-        [[ 76.0,  77.0,  78.0],
-         [ 79.0,  80.0,  81.0],
-         [ 82.0,  83.0,  84.0],
-         [ 85.0,  86.0,  87.0],
-         [ 88.0,  89.0,  90.0]],
-        [[ 91.0,  92.0,  93.0],
-         [ 94.0,  95.0,  96.0],
-         [ 97.0,  98.0,  99.0],
-         [100.0, 101.0, 102.0],
-         [103.0, 104.0, 105.0]],
-        [[106.0, 107.0, 108.0],
-         [109.0, 110.0, 111.0],
-         [112.0, 113.0, 114.0],
-         [115.0, 116.0, 117.0],
-         [118.0, 119.0, 120.0]]]]> : tensor<2x4x5x3xf32>
-  %weights = iree.unfoldable_constant dense<[
-      [[[  1.0,   2.0,   3.0,   4.0,   5.0,   6.0],
-        [  7.0,   8.0,   9.0,  10.0,  11.0,  12.0],
-        [ 13.0,  14.0,  15.0,  16.0,  17.0,  18.0]],
-       [[ 19.0,  20.0,  21.0,  22.0,  23.0,  24.0],
-        [ 25.0,  26.0,  27.0,  28.0,  29.0,  30.0],
-        [ 31.0,  32.0,  33.0,  34.0,  35.0,  36.0]],
-       [[ 37.0,  38.0,  39.0,  40.0,  41.0,  42.0],
-        [ 43.0,  44.0,  45.0,  46.0,  47.0,  48.0],
-        [ 49.0,  50.0,  51.0,  52.0,  53.0,  54.0]]],
-      [[[ 55.0,  56.0,  57.0,  58.0,  59.0,  60.0],
-        [ 61.0,  62.0,  63.0,  64.0,  65.0,  66.0],
-        [ 67.0,  68.0,  69.0,  70.0,  71.0,  72.0]],
-       [[ 73.0,  74.0,  75.0,  76.0,  77.0,  78.0],
-        [ 79.0,  80.0,  81.0,  82.0,  83.0,  84.0],
-        [ 85.0,  86.0,  87.0,  88.0,  89.0,  90.0]],
-       [[ 91.0,  92.0,  93.0,  94.0,  95.0,  96.0],
-        [ 97.0,  98.0,  99.0, 100.0, 101.0, 102.0],
-        [103.0, 104.0, 105.0, 106.0, 107.0, 108.0]]]]> : tensor<2x3x3x6xf32>
-  %res = "mhlo.convolution"(%inputs, %weights) {
-       batch_group_count = 1 : i64,
-       dimension_numbers = {
-         input_batch_dimension = 0 : i64,
-         input_feature_dimension = 3 : i64,
-         input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
-         kernel_input_feature_dimension = 2 : i64,
-         kernel_output_feature_dimension = 3 : i64,
-         kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
-         output_batch_dimension = 0 : i64,
-         output_feature_dimension = 3 : i64,
-         output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>},
-       feature_group_count = 1 : i64,
-       rhs_dilation = dense<1> : tensor<2xi64>,
-       window_strides = dense<1> : tensor<2xi64>} :
-       (tensor<2x4x5x3xf32>, tensor<2x3x3x6xf32>) -> tensor<2x3x3x6xf32>
-  check.expect_almost_eq_const(%res, dense<[
-      [[[16065.0,  16290.0,  16515.0,  16740.0,  16965.0,  17190.0],
-        [18873.0,  19152.0,  19431.0,  19710.0,  19989.0,  20268.0],
-        [21681.0,  22014.0,  22347.0,  22680.0,  23013.0,  23346.0]],
-       [[30105.0,  30600.0,  31095.0,  31590.0,  32085.0,  32580.0],
-        [32913.0,  33462.0,  34011.0,  34560.0,  35109.0,  35658.0],
-        [35721.0,  36324.0,  36927.0,  37530.0,  38133.0,  38736.0]],
-       [[44145.0,  44910.0,  45675.0,  46440.0,  47205.0,  47970.0],
-        [46953.0,  47772.0,  48591.0,  49410.0,  50229.0,  51048.0],
-        [49761.0,  50634.0,  51507.0,  52380.0,  53253.0,  54126.0]]],
-      [[[72225.0,  73530.0,  74835.0,  76140.0,  77445.0,  78750.0],
-        [75033.0,  76392.0,  77751.0,  79110.0,  80469.0,  81828.0],
-        [77841.0,  79254.0,  80667.0,  82080.0,  83493.0,  84906.0]],
-       [[86265.0,  87840.0,  89415.0,  90990.0,  92565.0,  94140.0],
-        [89073.0,  90702.0,  92331.0,  93960.0,  95589.0,  97218.0],
-        [91881.0,  93564.0,  95247.0,  96930.0,  98613.0, 100296.0]],
-       [[100305.0, 102150.0, 103995.0, 105840.0, 107685.0, 109530.0],
-        [103113.0, 105012.0, 106911.0, 108810.0, 110709.0, 112608.0],
-        [105921.0, 107874.0, 109827.0, 111780.0, 113733.0, 115686.0]]]]> : tensor<2x3x3x6xf32>) : tensor<2x3x3x6xf32>
-  return
-}
diff --git a/iree/test/e2e/xla_ops/BUILD b/iree/test/e2e/xla_ops/BUILD
index 9af6bd1..744ff22 100644
--- a/iree/test/e2e/xla_ops/BUILD
+++ b/iree/test/e2e/xla_ops/BUILD
@@ -62,6 +62,7 @@
         # "gather.mlir",
         # "gather_concat.mlir",
         #
+        "iota.mlir",
         "log.mlir",
         "maximum.mlir",
         "minimum.mlir",
@@ -82,9 +83,7 @@
         "tanh.mlir",
         "torch_index_select.mlir",
         "transpose.mlir",
-
-        # TODO(#2022): fails on real devices.
-        # "while.mlir",
+        "while.mlir",
     ],
     driver = "vulkan",
     target_backend = "vulkan-spirv",
@@ -108,6 +107,7 @@
         "divide.mlir",
         "dot.mlir",
         "exponential.mlir",
+        "iota.mlir",
         "log.mlir",
         "maximum.mlir",
         "minimum.mlir",
diff --git a/iree/test/e2e/xla_ops/CMakeLists.txt b/iree/test/e2e/xla_ops/CMakeLists.txt
index 7f65b06..e4ae959 100644
--- a/iree/test/e2e/xla_ops/CMakeLists.txt
+++ b/iree/test/e2e/xla_ops/CMakeLists.txt
@@ -46,6 +46,7 @@
     "divide.mlir"
     "dot.mlir"
     "exponential.mlir"
+    "iota.mlir"
     "log.mlir"
     "maximum.mlir"
     "minimum.mlir"
@@ -66,6 +67,7 @@
     "tanh.mlir"
     "torch_index_select.mlir"
     "transpose.mlir"
+    "while.mlir"
   TARGET_BACKEND
     vulkan-spirv
   DRIVER
@@ -91,6 +93,7 @@
     "divide.mlir"
     "dot.mlir"
     "exponential.mlir"
+    "iota.mlir"
     "log.mlir"
     "maximum.mlir"
     "minimum.mlir"
diff --git a/iree/test/e2e/xla_ops/convolution.mlir b/iree/test/e2e/xla_ops/convolution.mlir
index 22c5258..6ac1719 100644
--- a/iree/test/e2e/xla_ops/convolution.mlir
+++ b/iree/test/e2e/xla_ops/convolution.mlir
@@ -65,51 +65,47 @@
   return
 }
 
-// TODO(#2345): This test seems to fail when executed with another
-// test from this file, but passes as a standalone test. Needs further
-// investigation
-
-// func @conv2d_2451x2311_same() attributes { iree.module.export } {
-//   %inputs = iree.unfoldable_constant dense<[
-//       [[[ 1.0], [ 2.0], [ 3.0], [ 4.0], [ 5.0]],
-//        [[ 6.0], [ 7.0], [ 8.0], [ 9.0], [10.0]],
-//        [[11.0], [12.0], [13.0], [14.0], [15.0]],
-//        [[16.0], [17.0], [18.0], [19.0], [20.0]]],
-//       [[[21.0], [22.0], [23.0], [24.0], [25.0]],
-//        [[26.0], [27.0], [28.0], [29.0], [30.0]],
-//        [[31.0], [32.0], [33.0], [34.0], [35.0]],
-//        [[36.0], [37.0], [38.0], [39.0], [40.0]]]]> : tensor <2x4x5x1xf32>
-//   %weights = iree.unfoldable_constant dense<[
-//       [[[1.0]], [[2.0]], [[3.0]]],
-//       [[[4.0]], [[5.0]], [[6.0]]]]> : tensor <2x3x1x1xf32>
-//   %res = "mhlo.convolution"(%inputs, %weights) {
-//        batch_group_count = 1 : i64,
-//        dimension_numbers = {
-//          input_batch_dimension = 0 : i64,
-//          input_feature_dimension = 3 : i64,
-//          input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
-//          kernel_input_feature_dimension = 2 : i64,
-//          kernel_output_feature_dimension = 3 : i64,
-//          kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
-//          output_batch_dimension = 0 : i64,
-//          output_feature_dimension = 3 : i64,
-//          output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>},
-//        feature_group_count = 1 : i64,
-//        padding = dense<[[0, 1], [1, 1]]> : tensor<2x2xi64>,
-//        rhs_dilation = dense<1> : tensor<2xi64>,
-//        window_strides = dense<1> : tensor<2xi64>} :
-//        (tensor<2x4x5x1xf32>, tensor<2x3x1x1xf32>) -> tensor<2x4x5x1xf32>
-//   check.expect_almost_eq_const(%res, dense<[
-//     [[[ 80.0], [121.0], [142.0], [163.0], [100.0]],
-//      [[160.0], [226.0], [247.0], [268.0], [160.0]],
-//      [[240.0], [331.0], [352.0], [373.0], [220.0]],
-//      [[ 83.0], [104.0], [110.0], [116.0], [ 59.0]]],
-//     [[[400.0], [541.0], [562.0], [583.0], [340.0]],
-//      [[480.0], [646.0], [667.0], [688.0], [400.0]],
-//      [[560.0], [751.0], [772.0], [793.0], [460.0]],
-//      [[183.0], [224.0], [230.0], [236.0], [119.0]]]]> : tensor<2x4x5x1xf32>) : tensor<2x4x5x1xf32>
-//   return
-// }
+func @conv2d_2451x2311_same() attributes { iree.module.export } {
+  %inputs = iree.unfoldable_constant dense<[
+      [[[ 1.0], [ 2.0], [ 3.0], [ 4.0], [ 5.0]],
+       [[ 6.0], [ 7.0], [ 8.0], [ 9.0], [10.0]],
+       [[11.0], [12.0], [13.0], [14.0], [15.0]],
+       [[16.0], [17.0], [18.0], [19.0], [20.0]]],
+      [[[21.0], [22.0], [23.0], [24.0], [25.0]],
+       [[26.0], [27.0], [28.0], [29.0], [30.0]],
+       [[31.0], [32.0], [33.0], [34.0], [35.0]],
+       [[36.0], [37.0], [38.0], [39.0], [40.0]]]]> : tensor <2x4x5x1xf32>
+  %weights = iree.unfoldable_constant dense<[
+      [[[1.0]], [[2.0]], [[3.0]]],
+      [[[4.0]], [[5.0]], [[6.0]]]]> : tensor <2x3x1x1xf32>
+  %res = "mhlo.convolution"(%inputs, %weights) {
+       batch_group_count = 1 : i64,
+       dimension_numbers = {
+         input_batch_dimension = 0 : i64,
+         input_feature_dimension = 3 : i64,
+         input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
+         kernel_input_feature_dimension = 2 : i64,
+         kernel_output_feature_dimension = 3 : i64,
+         kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
+         output_batch_dimension = 0 : i64,
+         output_feature_dimension = 3 : i64,
+         output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>},
+       feature_group_count = 1 : i64,
+       padding = dense<[[0, 1], [1, 1]]> : tensor<2x2xi64>,
+       rhs_dilation = dense<1> : tensor<2xi64>,
+       window_strides = dense<1> : tensor<2xi64>} :
+       (tensor<2x4x5x1xf32>, tensor<2x3x1x1xf32>) -> tensor<2x4x5x1xf32>
+  check.expect_almost_eq_const(%res, dense<[
+    [[[ 80.0], [121.0], [142.0], [163.0], [100.0]],
+     [[160.0], [226.0], [247.0], [268.0], [160.0]],
+     [[240.0], [331.0], [352.0], [373.0], [220.0]],
+     [[ 83.0], [104.0], [110.0], [116.0], [ 59.0]]],
+    [[[400.0], [541.0], [562.0], [583.0], [340.0]],
+     [[480.0], [646.0], [667.0], [688.0], [400.0]],
+     [[560.0], [751.0], [772.0], [793.0], [460.0]],
+     [[183.0], [224.0], [230.0], [236.0], [119.0]]]]> : tensor<2x4x5x1xf32>) : tensor<2x4x5x1xf32>
+  return
+}
 
 func @conv2d_no_padding2() attributes { iree.module.export } {
   %inputs = iree.unfoldable_constant dense<[
diff --git a/iree/test/e2e/xla_ops/iota.mlir b/iree/test/e2e/xla_ops/iota.mlir
new file mode 100644
index 0000000..c3ff040
--- /dev/null
+++ b/iree/test/e2e/xla_ops/iota.mlir
@@ -0,0 +1,16 @@
+func @iota_dim0() attributes { iree.module.export } {
+  %result = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<2x3xf32>
+  check.expect_almost_eq_const(%result, dense<[
+    [0.0, 0.0, 0.0],
+    [1.0, 1.0, 1.0]]> : tensor<2x3xf32>) : tensor<2x3xf32>
+  return
+}
+
+
+func @iota_dim1() attributes { iree.module.export } {
+  %result = "mhlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<2x3xf32>
+  check.expect_almost_eq_const(%result, dense<[
+    [0.0, 1.0, 2.0],
+    [0.0, 1.0, 2.0]]> : tensor<2x3xf32>) : tensor<2x3xf32>
+  return
+}
diff --git a/iree/test/e2e/xla_ops/pad.mlir b/iree/test/e2e/xla_ops/pad.mlir
index 7f6df37..537e684 100644
--- a/iree/test/e2e/xla_ops/pad.mlir
+++ b/iree/test/e2e/xla_ops/pad.mlir
@@ -20,3 +20,19 @@
   check.expect_eq(%res, %input) : tensor<2x3xi32>
   return
 }
+
+func @pad_with_interior_padding() attributes { iree.module.export } {
+  %input = iree.unfoldable_constant dense<[[1, 2, 3], [4, 5, 6]]> : tensor<2x3xi32>
+  %c0 = iree.unfoldable_constant dense<0> : tensor<i32>
+  %res = "mhlo.pad"(%input, %c0) {
+    edge_padding_low = dense<[0, 1]> : tensor<2xi64>,
+    edge_padding_high = dense<[1, 5]> : tensor<2xi64>,
+    interior_padding = dense<[1, 2]> : tensor<2xi64>
+  } : (tensor<2x3xi32>, tensor<i32>) -> tensor<4x13xi32>
+  check.expect_eq_const(%res, dense<[
+      [0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 0, 0, 0],
+      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+      [0, 4, 0, 0, 5, 0, 0, 6, 0, 0, 0, 0, 0],
+      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]> : tensor<4x13xi32>) : tensor<4x13xi32>
+  return
+}
diff --git a/iree/tools/CMakeLists.txt b/iree/tools/CMakeLists.txt
index 93f0b2f..a277877 100644
--- a/iree/tools/CMakeLists.txt
+++ b/iree/tools/CMakeLists.txt
@@ -24,6 +24,16 @@
     "iree::hal::vulkan::vulkan_driver_module"
 )
 
+if(IREE_ENABLE_EMITC)
+  set(IREE_OPT_CONDITIONAL_DEPS
+    MLIREmitC
+  )
+  set(IREE_TRANSLATE_CONDITIONAL_DEPS
+    MLIREmitC
+    MLIRTargetCpp
+  )
+endif()
+
 iree_cc_binary(
   NAME
     iree-benchmark-module
@@ -255,6 +265,7 @@
       iree::compiler::Dialect::Vulkan::IR
       iree::compiler::Translation::IREEVM
       tensorflow::mlir_hlo
+      ${IREE_OPT_CONDITIONAL_DEPS}
     PUBLIC
   )
 
@@ -280,6 +291,7 @@
       iree::compiler::Dialect::VM::Target::Bytecode
       iree::compiler::Dialect::VM::Target::init_targets
       iree::compiler::Translation::IREEVM
+      ${IREE_TRANSLATE_CONDITIONAL_DEPS}
     PUBLIC
   )
 
diff --git a/iree/tools/init_mlir_passes.h b/iree/tools/init_mlir_passes.h
index 814d844..58dede2 100644
--- a/iree/tools/init_mlir_passes.h
+++ b/iree/tools/init_mlir_passes.h
@@ -33,6 +33,7 @@
 #include "mlir/Dialect/Quant/Passes.h"
 #include "mlir/Dialect/SCF/Passes.h"
 #include "mlir/Dialect/SPIRV/Passes.h"
+#include "mlir/Dialect/Shape/Transforms/Passes.h"
 #include "mlir/Transforms/LocationSnapshot.h"
 #include "mlir/Transforms/Passes.h"
 
@@ -136,6 +137,10 @@
   quant::createConvertSimulatedQuantPass();
   quant::createConvertConstPass();
 
+  // Shape
+#define GEN_PASS_REGISTRATION
+#include "mlir/Dialect/Shape/Transforms/Passes.h.inc"
+
   // SPIR-V
   spirv::createLowerABIAttributesPass();
   createConvertGPUToSPIRVPass();
diff --git a/iree/tools/opt_main.cc b/iree/tools/opt_main.cc
index 57ebe15..260901c 100644
--- a/iree/tools/opt_main.cc
+++ b/iree/tools/opt_main.cc
@@ -37,6 +37,10 @@
 #include "mlir/Support/FileUtilities.h"
 #include "mlir/Support/MlirOptMain.h"
 
+#ifdef IREE_HAVE_EMITC_DIALECT
+#include "emitc/InitDialect.h"
+#endif  // IREE_HAVE_EMITC_DIALECT
+
 static llvm::cl::opt<std::string> inputFilename(llvm::cl::Positional,
                                                 llvm::cl::desc("<input file>"),
                                                 llvm::cl::init("-"));
@@ -74,6 +78,9 @@
 int main(int argc, char **argv) {
   mlir::registerMlirDialects();
   mlir::registerMlirPasses();
+#ifdef IREE_HAVE_EMITC_DIALECT
+  mlir::registerEmitCDialect();
+#endif  // IREE_HAVE_EMITC_DIALECT
   mlir::registerXLADialects();
   mlir::iree_compiler::registerIreeDialects();
   mlir::iree_compiler::registerIreeCompilerModuleDialects();
diff --git a/iree/tools/run_module_main.cc b/iree/tools/run_module_main.cc
index 96d0584..41baed1 100644
--- a/iree/tools/run_module_main.cc
+++ b/iree/tools/run_module_main.cc
@@ -142,8 +142,8 @@
   RETURN_IF_ERROR(PrintVariantList(output_descs, outputs.get()))
       << "printing results";
 
-  iree_vm_list_deinitialize(inputs.get());
-  iree_vm_list_deinitialize(outputs.get());
+  inputs.reset();
+  outputs.reset();
   iree_vm_module_release(hal_module);
   iree_vm_module_release(input_module);
   iree_hal_device_release(device);
diff --git a/iree/tools/translate_main.cc b/iree/tools/translate_main.cc
index 97b4334..50edac7 100644
--- a/iree/tools/translate_main.cc
+++ b/iree/tools/translate_main.cc
@@ -38,6 +38,11 @@
 #include "mlir/Support/ToolUtilities.h"
 #include "mlir/Translation.h"
 
+#ifdef IREE_HAVE_EMITC_DIALECT
+#include "emitc/InitDialect.h"
+#include "emitc/InitTranslation.h"
+#endif  // IREE_HAVE_EMITC_DIALECT
+
 static llvm::cl::opt<std::string> inputFilename(llvm::cl::Positional,
                                                 llvm::cl::desc("<input file>"),
                                                 llvm::cl::init("-"));
@@ -56,12 +61,18 @@
   llvm::InitLLVM y(argc, argv);
 
   mlir::registerMlirDialects();
+#ifdef IREE_HAVE_EMITC_DIALECT
+  mlir::registerEmitCDialect();
+#endif  // IREE_HAVE_EMITC_DIALECT
   mlir::registerXLADialects();
   mlir::iree_compiler::registerIreeDialects();
   mlir::iree_compiler::registerIreeCompilerModuleDialects();
   mlir::iree_compiler::registerHALTargetBackends();
   mlir::iree_compiler::registerVMTargets();
   mlir::registerMlirTranslations();
+#ifdef IREE_HAVE_EMITC_DIALECT
+  mlir::registerEmitCTranslation();
+#endif  // IREE_HAVE_EMITC_DIALECT
   mlir::iree_compiler::registerIreeTranslations();
   mlir::iree_compiler::registerLinalgToSPIRVPasses();
 
diff --git a/iree/vm/bytecode_dispatch.c b/iree/vm/bytecode_dispatch.c
index 86b5d1d..2565c5e 100644
--- a/iree/vm/bytecode_dispatch.c
+++ b/iree/vm/bytecode_dispatch.c
@@ -209,7 +209,7 @@
 #define DECLARE_DISPATCH_OPC(ordinal, name) &&_dispatch_##name,
 #define DECLARE_DISPATCH_RSV(ordinal) &&_dispatch_unhandled,
   static const void* kDispatchTable[256] = {
-      IREE_VM_OP_TABLE(DECLARE_DISPATCH_OPC, DECLARE_DISPATCH_RSV)};
+      IREE_VM_OP_CORE_TABLE(DECLARE_DISPATCH_OPC, DECLARE_DISPATCH_RSV)};
 
 #define DISPATCH_UNHANDLED() \
   _dispatch_unhandled:       \
@@ -238,7 +238,7 @@
     return IREE_STATUS_UNIMPLEMENTED;
 
 #define DISPATCH_OP(op_name, body)      \
-  case IREE_VM_OP_##op_name:            \
+  case IREE_VM_OP_CORE_##op_name:       \
     IREE_DISPATCH_LOG_OPCODE(#op_name); \
     body;                               \
     break;
@@ -608,7 +608,9 @@
     DISPATCH_OP_CAST_I32(TruncI32I8, uint8_t, uint32_t);
     DISPATCH_OP_CAST_I32(TruncI32I16, uint16_t, uint32_t);
     DISPATCH_OP_CAST_I32(ExtI8I32S, int8_t, int32_t);
+    DISPATCH_OP_CAST_I32(ExtI8I32U, uint8_t, uint32_t);
     DISPATCH_OP_CAST_I32(ExtI16I32S, int16_t, int32_t);
+    DISPATCH_OP_CAST_I32(ExtI16I32U, uint16_t, uint32_t);
 
     //===------------------------------------------------------------------===//
     // Native bitwise shifts and rotates
@@ -642,12 +644,6 @@
     DISPATCH_OP_CMP_I32(CmpNEI32, int32_t, !=);
     DISPATCH_OP_CMP_I32(CmpLTI32S, int32_t, <);
     DISPATCH_OP_CMP_I32(CmpLTI32U, uint32_t, <);
-    DISPATCH_OP_CMP_I32(CmpLTEI32S, int32_t, <=);
-    DISPATCH_OP_CMP_I32(CmpLTEI32U, uint32_t, <=);
-    DISPATCH_OP_CMP_I32(CmpGTI32S, int32_t, >);
-    DISPATCH_OP_CMP_I32(CmpGTI32U, uint32_t, >);
-    DISPATCH_OP_CMP_I32(CmpGTEI32S, int32_t, >=);
-    DISPATCH_OP_CMP_I32(CmpGTEI32U, uint32_t, >=);
     DISPATCH_OP(CmpNZI32, {
       int32_t operand = VM_DecOperandRegI32("operand");
       int32_t* result = VM_DecResultRegI32("result");
@@ -891,6 +887,16 @@
       pc = block_pc;
     });
 
+    //===------------------------------------------------------------------===//
+    // Extension trampolines
+    //===------------------------------------------------------------------===//
+
+    DISPATCH_OP(PrefixExtI64, { return IREE_STATUS_UNIMPLEMENTED; });
+
+    DISPATCH_OP(PrefixExtF32, { return IREE_STATUS_UNIMPLEMENTED; });
+
+    DISPATCH_OP(PrefixExtF64, { return IREE_STATUS_UNIMPLEMENTED; });
+
     // NOLINTNEXTLINE(misc-static-assert)
     DISPATCH_UNHANDLED();
   }
diff --git a/iree/vm/module.h b/iree/vm/module.h
index e278992..3081e10 100644
--- a/iree/vm/module.h
+++ b/iree/vm/module.h
@@ -210,7 +210,7 @@
   // attributes.
   // Returns IREE_STATUS_NOT_FOUND if index >= the number of attributes for
   // the function.
-  // See: docs/function_abi.md
+  // See: docs/design_docs/function_abi.md
   iree_status_t(IREE_API_PTR* get_function_reflection_attr)(
       void* self, iree_vm_function_linkage_t linkage, int32_t ordinal,
       int32_t index, iree_string_view_t* key, iree_string_view_t* value);
@@ -277,7 +277,7 @@
 // Returns the empty string if the reflection data in general or the specific
 // key is not found.
 //
-// See: docs/function_abi.md for documentation on the ABI.
+// See: docs/design_docs/function_abi.md for documentation on the ABI.
 IREE_API_EXPORT iree_string_view_t IREE_API_CALL
 iree_vm_function_reflection_attr(const iree_vm_function_t* function,
                                  iree_string_view_t key);
@@ -289,7 +289,7 @@
 // attributes.
 // Returns IREE_STATUS_NOT_FOUND if index >= the number of attributes for
 // the function.
-// See: docs/function_abi.md
+// See: docs/design_docs/function_abi.md
 IREE_API_EXPORT iree_status_t IREE_API_CALL
 iree_vm_get_function_reflection_attr(iree_vm_function_t function, int32_t index,
                                      iree_string_view_t* key,
diff --git a/iree/vm/test/BUILD b/iree/vm/test/BUILD
index 5018865..c33c07d 100644
--- a/iree/vm/test/BUILD
+++ b/iree/vm/test/BUILD
@@ -25,6 +25,7 @@
     name = "all_bytecode_modules_cc",
     srcs = [
         ":arithmetic_ops.module",
+        ":comparison_ops.module",
         ":control_flow_ops.module",
         ":list_ops.module",
     ],
@@ -41,6 +42,12 @@
 )
 
 iree_bytecode_module(
+    name = "comparison_ops",
+    src = "comparison_ops.mlir",
+    flags = ["-iree-vm-ir-to-bytecode-module"],
+)
+
+iree_bytecode_module(
     name = "control_flow_ops",
     src = "control_flow_ops.mlir",
     flags = ["-iree-vm-ir-to-bytecode-module"],
diff --git a/iree/vm/test/CMakeLists.txt b/iree/vm/test/CMakeLists.txt
index cb6951f..747595b 100644
--- a/iree/vm/test/CMakeLists.txt
+++ b/iree/vm/test/CMakeLists.txt
@@ -19,6 +19,7 @@
     all_bytecode_modules_cc
   GENERATED_SRCS
     "arithmetic_ops.module"
+    "comparison_ops.module"
     "control_flow_ops.module"
     "list_ops.module"
   CC_FILE_OUTPUT
@@ -43,6 +44,16 @@
 
 iree_bytecode_module(
   NAME
+    comparison_ops
+  SRC
+    "comparison_ops.mlir"
+  FLAGS
+    "-iree-vm-ir-to-bytecode-module"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
     control_flow_ops
   SRC
     "control_flow_ops.mlir"
diff --git a/iree/vm/test/comparison_ops.mlir b/iree/vm/test/comparison_ops.mlir
new file mode 100644
index 0000000..7161cb6
--- /dev/null
+++ b/iree/vm/test/comparison_ops.mlir
@@ -0,0 +1,172 @@
+vm.module @comparison_ops {
+
+  //===--------------------------------------------------------------------===//
+  // vm.cmp.lt.i32.s
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_cmp_lt_s_0
+  vm.func @test_cmp_lt_s_0() {
+    %lhs = vm.const.i32 2 : i32
+    %lhs_dno = iree.do_not_optimize(%lhs) : i32
+    %rhs = vm.const.i32 -2 : i32
+    %rhs_dno = iree.do_not_optimize(%rhs) : i32
+    %actual = vm.cmp.lt.i32.s %lhs_dno, %rhs_dno : i32
+    %expected = vm.const.i32 0 : i32
+    vm.check.eq %actual, %expected, "2 < -2" : i32
+    vm.return
+  }
+
+  vm.export @test_cmp_lt_s_1
+  vm.func @test_cmp_lt_s_1() {
+    %lhs = vm.const.i32 -2 : i32
+    %lhs_dno = iree.do_not_optimize(%lhs) : i32
+    %rhs = vm.const.i32 2 : i32
+    %rhs_dno = iree.do_not_optimize(%rhs) : i32
+    %actual = vm.cmp.lt.i32.s %lhs_dno, %rhs_dno : i32
+    %expected = vm.const.i32 1 : i32
+    vm.check.eq %actual, %expected, "-2 < 2" : i32
+    vm.return
+  }
+
+  // Expect UINT_MAX to be interpreted as -1 when doing a signed compare.
+  vm.export @test_cmp_lt_s_2
+  vm.func @test_cmp_lt_s_2() {
+    %lhs = vm.const.i32 4294967295 : i32
+    %lhs_dno = iree.do_not_optimize(%lhs) : i32
+    %rhs = vm.const.i32 2 : i32
+    %rhs_dno = iree.do_not_optimize(%rhs) : i32
+    %actual = vm.cmp.lt.i32.s %lhs_dno, %rhs_dno : i32
+    %expected = vm.const.i32 1 : i32
+    vm.check.eq %actual, %expected, "4294967295 (UINT_MAX) < 2" : i32
+    vm.return
+  }
+
+  //===--------------------------------------------------------------------===//
+  // vm.cmp.lt.i32.u
+  //===--------------------------------------------------------------------===//
+
+  vm.export @test_cmp_lt_u_0
+  vm.func @test_cmp_lt_u_0() {
+    %lhs = vm.const.i32 2 : i32
+    %lhs_dno = iree.do_not_optimize(%lhs) : i32
+    %rhs = vm.const.i32 -2 : i32
+    %rhs_dno = iree.do_not_optimize(%rhs) : i32
+    %actual = vm.cmp.lt.i32.u %lhs_dno, %rhs_dno : i32
+    %expected = vm.const.i32 1 : i32
+    vm.check.eq %actual, %expected, "2 < -2 (as unsigned)" : i32
+    vm.return
+  }
+
+  vm.export @test_cmp_lt_u_1
+  vm.func @test_cmp_lt_u_1() {
+    %lhs = vm.const.i32 -2 : i32
+    %lhs_dno = iree.do_not_optimize(%lhs) : i32
+    %rhs = vm.const.i32 2 : i32
+    %rhs_dno = iree.do_not_optimize(%rhs) : i32
+    %actual = vm.cmp.lt.i32.u %lhs_dno, %rhs_dno : i32
+    %expected = vm.const.i32 0 : i32
+    vm.check.eq %actual, %expected, "-2 < 2 (as unsigned)" : i32
+    vm.return
+  }
+
+  vm.export @test_cmp_lt_u_2
+  vm.func @test_cmp_lt_u_2() {
+    %lhs = vm.const.i32 4294967295 : i32
+    %lhs_dno = iree.do_not_optimize(%lhs) : i32
+    %rhs = vm.const.i32 2 : i32
+    %rhs_dno = iree.do_not_optimize(%rhs) : i32
+    %actual = vm.cmp.lt.i32.u %lhs_dno, %rhs_dno : i32
+    %expected = vm.const.i32 0 : i32
+    vm.check.eq %actual, %expected, "4294967295 (UINT_MAX) < 2 (as unsigned)" : i32
+    vm.return
+  }
+
+  //===--------------------------------------------------------------------===//
+  // vm.cmp.*.i32.* pseudo-ops
+  //===--------------------------------------------------------------------===//
+  // NOTE: all of these are turned in to some variants of vm.cmp.lt by the
+  // compiler and are here as a way to test the runtime behavior of the
+  // pseudo-op expansions.
+
+  vm.export @test_cmp_lte
+  vm.func @test_cmp_lte() {
+    %true = vm.const.i32 1 : i32
+    %false = vm.const.i32 0 : i32
+
+    %cn2 = vm.const.i32 -2 : i32
+    %cn2_dno = iree.do_not_optimize(%cn2) : i32
+    %c2 = vm.const.i32 2 : i32
+    %c2_dno = iree.do_not_optimize(%c2) : i32
+
+    %cmp_0 = vm.cmp.lte.i32.s %cn2_dno, %c2_dno : i32
+    vm.check.eq %cmp_0, %true, "-2 <= 2" : i32
+    %cmp_1 = vm.cmp.lte.i32.s %c2_dno, %cn2_dno : i32
+    vm.check.eq %cmp_1, %false, "2 <= -2" : i32
+    %cmp_2 = vm.cmp.lte.i32.s %c2_dno, %c2_dno : i32
+    vm.check.eq %cmp_2, %true, "2 <= 2" : i32
+
+    %cmp_3 = vm.cmp.lte.i32.u %cn2_dno, %c2_dno : i32
+    vm.check.eq %cmp_3, %false, "-2 <= 2 (unsigned)" : i32
+    %cmp_4 = vm.cmp.lte.i32.u %c2_dno, %cn2_dno : i32
+    vm.check.eq %cmp_4, %true, "2 <= -2 (unsigned)" : i32
+    %cmp_5 = vm.cmp.lte.i32.u %c2_dno, %c2_dno : i32
+    vm.check.eq %cmp_5, %true, "2 <= 2 (unsigned)" : i32
+
+    vm.return
+  }
+
+  vm.export @test_cmp_gt
+  vm.func @test_cmp_gt() {
+    %true = vm.const.i32 1 : i32
+    %false = vm.const.i32 0 : i32
+
+    %cn2 = vm.const.i32 -2 : i32
+    %cn2_dno = iree.do_not_optimize(%cn2) : i32
+    %c2 = vm.const.i32 2 : i32
+    %c2_dno = iree.do_not_optimize(%c2) : i32
+
+    %cmp_0 = vm.cmp.gt.i32.s %cn2_dno, %c2_dno : i32
+    vm.check.eq %cmp_0, %false, "-2 > 2" : i32
+    %cmp_1 = vm.cmp.gt.i32.s %c2_dno, %cn2_dno : i32
+    vm.check.eq %cmp_1, %true, "2 > -2" : i32
+    %cmp_2 = vm.cmp.gt.i32.s %c2_dno, %c2_dno : i32
+    vm.check.eq %cmp_2, %false, "2 > 2" : i32
+
+    %cmp_3 = vm.cmp.gt.i32.u %cn2_dno, %c2_dno : i32
+    vm.check.eq %cmp_3, %true, "-2 > 2 (unsigned)" : i32
+    %cmp_4 = vm.cmp.gt.i32.u %c2_dno, %cn2_dno : i32
+    vm.check.eq %cmp_4, %false, "2 > -2 (unsigned)" : i32
+    %cmp_5 = vm.cmp.gt.i32.u %c2_dno, %c2_dno : i32
+    vm.check.eq %cmp_5, %false, "2 > 2 (unsigned)" : i32
+
+    vm.return
+  }
+
+  vm.export @test_cmp_gte
+  vm.func @test_cmp_gte() {
+    %true = vm.const.i32 1 : i32
+    %false = vm.const.i32 0 : i32
+
+    %cn2 = vm.const.i32 -2 : i32
+    %cn2_dno = iree.do_not_optimize(%cn2) : i32
+    %c2 = vm.const.i32 2 : i32
+    %c2_dno = iree.do_not_optimize(%c2) : i32
+
+    %cmp_0 = vm.cmp.gte.i32.s %cn2_dno, %c2_dno : i32
+    vm.check.eq %cmp_0, %false, "-2 >= 2" : i32
+    %cmp_1 = vm.cmp.gte.i32.s %c2_dno, %cn2_dno : i32
+    vm.check.eq %cmp_1, %true, "2 >= -2" : i32
+    %cmp_2 = vm.cmp.gte.i32.s %c2_dno, %c2_dno : i32
+    vm.check.eq %cmp_2, %true, "2 >= 2" : i32
+
+    %cmp_3 = vm.cmp.gte.i32.u %cn2_dno, %c2_dno : i32
+    vm.check.eq %cmp_3, %true, "-2 >= 2 (unsigned)" : i32
+    %cmp_4 = vm.cmp.gte.i32.u %c2_dno, %cn2_dno : i32
+    vm.check.eq %cmp_4, %false, "2 >= -2 (unsigned)" : i32
+    %cmp_5 = vm.cmp.gte.i32.u %c2_dno, %c2_dno : i32
+    vm.check.eq %cmp_5, %true, "2 >= 2 (unsigned)" : i32
+
+    vm.return
+  }
+
+}
diff --git a/kokoro/gcp_ubuntu/bazel/bindings/common.cfg b/kokoro/gcp_ubuntu/bazel/bindings/common.cfg
index d4a4e26..8a49430 100644
--- a/kokoro/gcp_ubuntu/bazel/bindings/common.cfg
+++ b/kokoro/gcp_ubuntu/bazel/bindings/common.cfg
@@ -17,4 +17,4 @@
 # Common configuration for Kokoro builds that run the bindings build with bazel
 # on linux.
 
-build_file: "iree/kokoro/gcp_ubuntu/bazel/bindings/build_kokoro.sh"
+build_file: "iree/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/bindings/build_kokoro.sh"
diff --git a/kokoro/gcp_ubuntu/bazel/core/common.cfg b/kokoro/gcp_ubuntu/bazel/core/common.cfg
index b8ccaa7..3a22d10 100755
--- a/kokoro/gcp_ubuntu/bazel/core/common.cfg
+++ b/kokoro/gcp_ubuntu/bazel/core/common.cfg
@@ -17,4 +17,4 @@
 # Common configuration for Kokoro builds that run the core build with bazel on
 # linux.
 
-build_file: "iree/kokoro/gcp_ubuntu/bazel/core/build_kokoro.sh"
+build_file: "iree/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/core/build_kokoro.sh"
diff --git a/kokoro/gcp_ubuntu/bazel/integrations/common.cfg b/kokoro/gcp_ubuntu/bazel/integrations/common.cfg
index 9331508..eb31e55 100644
--- a/kokoro/gcp_ubuntu/bazel/integrations/common.cfg
+++ b/kokoro/gcp_ubuntu/bazel/integrations/common.cfg
@@ -17,4 +17,4 @@
 # Common configuration for Kokoro builds that run the integrations build with
 # bazel on linux.
 
-build_file: "iree/kokoro/gcp_ubuntu/bazel/integrations/build_kokoro.sh"
+build_file: "iree/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86/integrations/build_kokoro.sh"
diff --git a/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/common.cfg b/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/common.cfg
index 8159f36..1376e08 100644
--- a/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/common.cfg
+++ b/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/common.cfg
@@ -17,4 +17,4 @@
 # Common configuration for Kokoro builds that cross-compile IREE towards
 # Android arm64-v8a using CMake.
 
-build_file: "iree/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/build_kokoro.sh"
+build_file: "iree/build_tools/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/build_kokoro.sh"
diff --git a/kokoro/gcp_ubuntu/cmake/common.cfg b/kokoro/gcp_ubuntu/cmake/common.cfg
index 838f4a5..49e6865 100644
--- a/kokoro/gcp_ubuntu/cmake/common.cfg
+++ b/kokoro/gcp_ubuntu/cmake/common.cfg
@@ -16,4 +16,4 @@
 
 # Common configuration for Kokoro builds that run cmake on linux.
 
-build_file: "iree/kokoro/gcp_ubuntu/cmake/build_kokoro.sh"
+build_file: "iree/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86/build_kokoro.sh"
diff --git a/kokoro/gcp_ubuntu/cmake/linux/x86-turing/common.cfg b/kokoro/gcp_ubuntu/cmake/linux/x86-turing/common.cfg
index 9e6847d..bdb9163 100644
--- a/kokoro/gcp_ubuntu/cmake/linux/x86-turing/common.cfg
+++ b/kokoro/gcp_ubuntu/cmake/linux/x86-turing/common.cfg
@@ -16,4 +16,4 @@
 
 # Common configuration for Kokoro builds that run cmake on linux.
 
-build_file: "iree/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build_kokoro.sh"
+build_file: "iree/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build_kokoro.sh"
diff --git a/packaging/python/common_setup.py b/packaging/python/common_setup.py
index cbaf1d1..149dbdf 100644
--- a/packaging/python/common_setup.py
+++ b/packaging/python/common_setup.py
@@ -16,6 +16,7 @@
 import platform
 import setuptools
 import sys
+import sysconfig
 from datetime import date
 
 
@@ -99,15 +100,6 @@
   }
 
 
-def get_native_file_extension():
-  if platform.system() == "Windows":
-    return "pyd"
-  elif platform.system() == "Darwin":
-    return "dylib"
-  else:
-    return "so"
-
-
 def setup(**kwargs):
   # See: https://stackoverflow.com/q/45150304
   try:
@@ -128,7 +120,7 @@
   # Unfortunately, bazel is imprecise and scatters .so files around, so
   # need to be specific.
   package_data = {
-      "": ["*.%s" % (get_native_file_extension(),)],
+      "": ["*%s" % (sysconfig.get_config_var("EXT_SUFFIX"),)],
   }
   setuptools.setup(
       package_data=package_data,
diff --git a/packaging/python/setup_compiler.py b/packaging/python/setup_compiler.py
index c33d7fd..ee9e87d 100644
--- a/packaging/python/setup_compiler.py
+++ b/packaging/python/setup_compiler.py
@@ -39,7 +39,12 @@
   print("Found packages:", packages)
   setup_kwargs = common_setup.get_setup_defaults(
       sub_project="compiler", description="IREE Generic Compiler")
-  common_setup.setup(packages=packages, **setup_kwargs)
+  common_setup.setup(
+      packages=packages,
+      ext_modules=[
+          setuptools.Extension(name="pyiree.compiler.binding", sources=[]),
+      ],
+      **setup_kwargs)
 
 
 if __name__ == "__main__":
diff --git a/packaging/python/setup_rt.py b/packaging/python/setup_rt.py
index ef39248..af30fd6 100644
--- a/packaging/python/setup_rt.py
+++ b/packaging/python/setup_rt.py
@@ -35,7 +35,12 @@
   setup_kwargs = common_setup.get_setup_defaults(
       sub_project="rt",
       description="IREE Runtime Components (for executing compiled programs)")
-  common_setup.setup(packages=packages, **setup_kwargs)
+  common_setup.setup(
+      packages=packages,
+      ext_modules=[
+          setuptools.Extension(name="pyiree.rt.binding", sources=[]),
+      ],
+      **setup_kwargs)
 
 
 if __name__ == "__main__":
diff --git a/packaging/python/setup_tf.py b/packaging/python/setup_tf.py
index 6f97d70..9eccf4f 100644
--- a/packaging/python/setup_tf.py
+++ b/packaging/python/setup_tf.py
@@ -45,7 +45,12 @@
       sub_project="tf",
       description="IREE TensorFlow Compiler",
       package_dir=package_dir)
-  common_setup.setup(packages=packages, **setup_kwargs)
+  common_setup.setup(
+      packages=packages,
+      ext_modules=[
+          setuptools.Extension(name="pyiree.tf.compiler.binding", sources=[]),
+      ],
+      **setup_kwargs)
 
 
 if __name__ == "__main__":
diff --git a/scripts/prepare_doc_publication.py b/scripts/prepare_doc_publication.py
index a4bbad9..cc396dc 100755
--- a/scripts/prepare_doc_publication.py
+++ b/scripts/prepare_doc_publication.py
@@ -59,12 +59,8 @@
     'getting_started_android_cmake.md': 'Android with CMake',
     'generic_vulkan_env_setup.md': 'Generic Vulkan Setup',
     'getting_started_python.md': 'Python',
-    'cmake_options_and_variables.md': 'CMake Options and Variables',
-    'op_coverage.md': 'XLA HLO Operation Coverage',
-    'e2e_coverage.md': 'TensorFlow E2E Coverage',
-    'roadmap.md': 'Short-term Focus Areas',
-    'roadmap_design.md': 'Long-term Design Roadmap',
-    'iree_community.md': 'Community',
+    'milestones.md': 'Short-term Focus Areas',
+    'design_roadmap.md': 'Long-term Design Roadmap',
 }
 
 # A dictionary containing source file to permanent link mappings.
@@ -75,25 +71,6 @@
 # allows one to override the permanent link if necessary.
 PERMALINK_DICT = {
     'index.md': '/',
-    'getting_started_linux_bazel.md': 'GetStarted/LinuxBazel',
-    'getting_started_linux_cmake.md': 'GetStarted/LinuxCMake',
-    'getting_started_linux_vulkan.md': 'GetStarted/LinuxVulkan',
-    'getting_started_windows_bazel.md': 'GetStarted/WindowsBazel',
-    'getting_started_windows_cmake.md': 'GetStarted/WindowsCMake',
-    'getting_started_windows_vulkan.md': 'GetStarted/WindowsVulkan',
-    'getting_started_macos_cmake.md': 'GetStarted/macOSCMake',
-    'getting_started_macos_vulkan.md': 'GetStarted/macOSVulkan',
-    'getting_started_android_cmake.md': 'GetStarted/AndroidCMake',
-    'generic_vulkan_env_setup.md': 'GetStarted/GenericVulkanSetup',
-    'getting_started_python.md': 'GetStarted/Python',
-    'cmake_options_and_variables.md': 'GetStarted/CMakeOptionsVariables',
-    'developer_overview.md': 'DeveloperOverview',
-    'testing_guide.md': 'TestingGuide',
-    'op_coverage.md': 'HLOOpCoverage',
-    'e2e_coverage.md': 'TensorFlowE2ECoverage',
-    'roadmap.md': 'FocusAreas',
-    'roadmap_design.md': 'DesignRoadmap',
-    'iree_community.md': 'Community',
 }
 
 # A dictionary containing source file to navigation order mappings.
@@ -102,15 +79,18 @@
 # the left panel of https://google.github.io/iree website. This allows one
 # to specify an order for a specific doc.
 NAVI_ORDER_DICT = {
+    # Top level entries
     'index.md': 1,
-    # 'Getting Started' is 2.
-    'developer_overview.md': 3,
-    'roadmap_design.md': 4,
-    'roadmap.md': 5,
-    'op_coverage.md': 6,
-    'e2e_coverage.md': 7,
-    'testing_guide.md': 8,
+    # 'Using IREE' is 2.
+    # 'Getting Started' is 3.
+    # 'Developing IREE' is 4.
+    'design_roadmap.md': 5,
+    'milestones.md': 6,
+    'xla_op_coverage.md': 7,
+    'tf_e2e_coverage.md': 8,
     'iree_community.md': 9,
+    # 'Design Docs' is 10.
+    # 'Dialect Definitions' is 11.
 
     # Within 'Getting Started' use explicit ordering.
     # Alphabetical would put 'bazel' before 'cmake' and 'python' between 'linux'
@@ -127,6 +107,16 @@
     'getting_started_python.md': 10,
     'generic_vulkan_env_setup.md': 11,
     'cmake_options_and_variables.md': 12,
+
+    # Within 'Developing IREE' use explicit ordering.
+    'developer_overview.md': 1,
+    'contributor_tips.md': 2,
+    'testing_guide.md': 3,
+    'benchmarking.md': 4,
+    'repository_management.md': 5,
+
+    # Within 'Using IREE' use explicit ordering.
+    'using_colab.md': 1,
 }
 
 # A dictionary containing source directory to section tile mappings.
@@ -137,14 +127,17 @@
 # Note that the title here must match with index.md file's title under the
 # subdirectory.
 DIRECTORY_TITLE_DICT = {
+    'design_docs': 'Design Docs',
+    'developing_iree': 'Developing IREE',
     'Dialects': 'Dialect Definitions',
-    'GetStarted': 'Getting Started',
+    'get_started': 'Getting Started',
+    'using_iree': 'Using IREE',
 }
 
 # A dictionary containing the supporting JavaScript files for each doc.
 JS_FILES_DICT = {
-    'op_coverage.md': ['js/add_classes.js'],
-    'e2e_coverage.md': ['js/add_classes.js'],
+    'xla_op_coverage.md': ['js/add_classes.js'],
+    'tf_e2e_coverage.md': ['js/add_classes.js'],
 }
 
 
@@ -164,19 +157,20 @@
   # Use the default layout for everything.
   front_matter['layout'] = 'default'
   # Use the base filename as permanent link.
-  front_matter['permalink'] = base_name
+  # Replace '_' with '-'. Underscores are not typical in URLs...
+  front_matter['permalink'] = base_name.replace('_', '-')
 
   # Organize each doc to a section matching its directory structure.
   if relpath and relpath != '.':
-    front_matter['parent'] = relpath
-    front_matter['permalink'] = f'{relpath}/{front_matter["permalink"]}'
+    hyphen_relpath = relpath.replace('_', '-')
+    front_matter['permalink'] = f'{hyphen_relpath}/{front_matter["permalink"]}'
 
   # Find the title and TOC.
   lines = content.splitlines()
   title_line_index = None
   toc_index = None
   for (index, line) in enumerate(lines):
-    if line.startswith('# '):
+    if line.startswith('# ') and title_line_index is None:
       title_line_index = index
     if line == '[TOC]':
       toc_index = index
diff --git a/scripts/update_e2e_coverage.py b/scripts/update_e2e_coverage.py
index ea691e4..e9cf397 100755
--- a/scripts/update_e2e_coverage.py
+++ b/scripts/update_e2e_coverage.py
@@ -64,6 +64,7 @@
 - vulkan-spirv
 
 The table shows the supported TensorFlow functions and models on each backend.
+It is auto-generated from IREE's test status.
 
 """
 
@@ -172,7 +173,7 @@
     content.append(generate_table(test_suite))
   content = '\n\n'.join(content) + '\n'  # Trailing newline.
 
-  table_path = os.path.join(args.build_dir, 'doc', 'e2e_coverage.md')
+  table_path = os.path.join(args.build_dir, 'doc', 'tf_e2e_coverage.md')
   with open(table_path, 'w', encoding='utf-8') as f:
     f.write(E2E_COVERAGE_DESCRIPTION)
     f.write(content)
diff --git a/scripts/update_op_coverage.py b/scripts/update_op_coverage.py
index b01b687..c2c6dd9 100755
--- a/scripts/update_op_coverage.py
+++ b/scripts/update_op_coverage.py
@@ -30,14 +30,15 @@
 E2E_XLA_OPS_PATH = 'iree/test/e2e/xla_ops'
 
 # TODO(scotttodd): LLVM AOT (dylib-llvm-aot) HAL target(s)
-OP_COVERAGE_DESCRIPTION = """# HLO Op Coverage
+OP_COVERAGE_DESCRIPTION = """# XLA HLO Op Coverage
 There are three backend [targets](https://github.com/google/iree/tree/main/iree/compiler/Dialect/HAL/Target) in IREE:
 
 - vmla
 - llvm-ir
 - vulkan-spirv
 
-The table shows the supported XLA HLO ops on each backend.
+The table shows the supported XLA HLO ops on each backend. It is auto-generated
+from IREE's test status.
 
 """
 
@@ -117,7 +118,7 @@
 if __name__ == '__main__':
   args = parse_arguments()
   content = generate_table(args.build_dir)
-  table_path = os.path.join(args.build_dir, 'doc', 'op_coverage.md')
+  table_path = os.path.join(args.build_dir, 'doc', 'xla_op_coverage.md')
   with open(table_path, 'w', encoding='utf-8') as f:
     f.write(OP_COVERAGE_DESCRIPTION)
     f.write(content)
diff --git a/third_party/llvm-project b/third_party/llvm-project
index 7ca9b58..eed3331 160000
--- a/third_party/llvm-project
+++ b/third_party/llvm-project
@@ -1 +1 @@
-Subproject commit 7ca9b589c45302feb28c0b3b0e80088c0901bb40
+Subproject commit eed333149d178b69fdaf39b9419b7ca032520182
diff --git a/third_party/mlir-emitc b/third_party/mlir-emitc
index 67f3cce..80885f8 160000
--- a/third_party/mlir-emitc
+++ b/third_party/mlir-emitc
@@ -1 +1 @@
-Subproject commit 67f3ccebee84f3488b46a8d3ac005178c52ff264
+Subproject commit 80885f899e12d55a45561ef758eea47bb340dbf1
diff --git a/third_party/sdl2 b/third_party/sdl2
index b73f111..a1390ed 160000
--- a/third_party/sdl2
+++ b/third_party/sdl2
@@ -1 +1 @@
-Subproject commit b73f111094da3e380a1774b56b15f16c90ae8e23
+Subproject commit a1390ed39ec77ecfb574bc6fcd5bfc5e3adbdea9
diff --git a/third_party/tensorflow b/third_party/tensorflow
index e36aca0..e29e1f4 160000
--- a/third_party/tensorflow
+++ b/third_party/tensorflow
@@ -1 +1 @@
-Subproject commit e36aca0132fbcde0bc820d56185e3078f97a879d
+Subproject commit e29e1f4e574caab071e93cfb91fa9ee0944cd87c