Merge pull request #2281 from ScottTodd:docs-iree-opt PiperOrigin-RevId: 318352808

diff --git a/.bazelrc b/.bazelrc
index 3487c50..8d422f7 100644
--- a/.bazelrc
+++ b/.bazelrc

@@ -29,7 +29,7 @@
 test --test_output=errors
 
 ###############################################################################
-# Options for "generic_clang" builds: these options should generally apply to 
+# Options for "generic_clang" builds: these options should generally apply to
 # either clang or gcc and are curated based on need.
 ###############################################################################
 
@@ -43,6 +43,10 @@
 build:generic_clang --copt=-Wno-invalid-offsetof
 build:generic_clang --copt=-Wno-unused-function
 
+# Enable warnings we do care about.
+build:generic_clang --copt=-Wimplicit-fallthrough
+build:generic_clang --copt=-Wthread-safety-analysis
+
 # C++14 standard version is required.
 build:generic_clang --cxxopt=-std=c++14 --host_cxxopt=-std=c++14
 
@@ -190,4 +194,3 @@
 # The user.bazelrc file is not checked in but available for local mods.
 # Always keep this at the end of the file so that user flags override.
 try-import %workspace%/user.bazelrc
-

diff --git a/.github/workflows/synchronize_submodules.yml b/.github/workflows/synchronize_submodules.yml
index a27e7ed..f56af6b 100644
--- a/.github/workflows/synchronize_submodules.yml
+++ b/.github/workflows/synchronize_submodules.yml

@@ -41,7 +41,7 @@
       - name: Committing updates
         if: env.has_diff == 'true'
         run: |
-          git config --local user.email "noreply+action@github.com"
+          git config --local user.email "iree-github-actions-bot@google.com"
           git config --local user.name "Submodule Synchronize Action"
           git commit -am "Synchronize submodules"
       - name: Pushing changes

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 501cd0a..004bed5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt

@@ -33,6 +33,7 @@
 # Project component configuration
 #-------------------------------------------------------------------------------
 
+# LINT.IfChange(iree_options)
 option(IREE_ENABLE_RUNTIME_TRACING "Enables instrumented runtime tracing." OFF)
 option(IREE_ENABLE_MLIR "Enables MLIR/LLVM dependencies." ON)
 option(IREE_ENABLE_EMITC "Enables MLIR EmitC dependencies." OFF)
@@ -50,6 +51,10 @@
   CACHE STRING "Semicolon-separated list of HAL drivers to build, or \"all\"." FORCE)
 set(IREE_TARGET_BACKENDS_TO_BUILD "all"
   CACHE STRING "Semicolon-separated list of target backends to build, or \"all\"." FORCE)
+# LINT.ThenChange(
+#   https://github.com/google/iree/tree/master/build_tools/cmake/iree_cross_compile.cmake:iree_cross_compile_options,
+#   https://github.com/google/iree/tree/master/build_tools/cmake/iree_cross_compile.cmake:iree_cross_compile_invoke
+# )
 
 if(${IREE_BUILD_SAMPLES} OR ${IREE_BUILD_EXPERIMENTAL})
   set(IREE_BUILD_COMPILER ON CACHE BOOL "Build the IREE compiler for sample projects." FORCE)
@@ -136,6 +141,44 @@
 )
 
 #-------------------------------------------------------------------------------
+# Cross compiling configuration
+#-------------------------------------------------------------------------------
+
+if(CMAKE_CROSSCOMPILING)
+  if(CMAKE_HOST_SYSTEM_NAME STREQUAL Windows)
+    message(FATAL_ERROR "Cross compilation with Windows host system is not supported yet")
+  endif()
+
+  message(STATUS "Detected cross compilation mode; configuring IREE on host...")
+
+  # C/C++ compilers for host compilation.
+  # Note: we need to explicitly set this because IREE does not work well with
+  # GCC at the moment: https://github.com/google/iree/issues/1269
+  set(IREE_HOST_C_COMPILER "$ENV{IREE_HOST_C_COMPILER}" CACHE FILEPATH "C compiler for host compilation")
+  set(IREE_HOST_CXX_COMPILER "$ENV{IREE_HOST_CXX_COMPILER}" CACHE FILEPATH "C++ compiler for host compilation")
+
+  # Master configuration for the binary directory containing all artifacts
+  # compiled for host.
+  if(NOT IREE_HOST_BINARY_ROOT)
+    set(IREE_HOST_BINARY_ROOT "${CMAKE_CURRENT_BINARY_DIR}/host" CACHE FILEPATH "directory containing host artifacts")
+  endif()
+
+  set(IREE_HOST_BUILD_COMPILER ON) # For iree-translate
+  set(IREE_HOST_ENABLE_LLVM ON)    # For iree-tblgen
+
+  # Set the host build directory for LLVM to our directory. Otherwise it will
+  # follow its own convention.
+  set(LLVM_NATIVE_BUILD "${IREE_HOST_BINARY_ROOT}/third_party/llvm-project/llvm")
+
+  include(iree_cross_compile)
+
+  # Use another CMake invocation to configure a build for host.
+  iree_create_configuration(HOST)
+
+  message(STATUS "Done configuring IREE on host in ${IREE_HOST_BINARY_ROOT}")
+endif()
+
+#-------------------------------------------------------------------------------
 # IREE utility definitions
 #-------------------------------------------------------------------------------
 
@@ -291,6 +334,24 @@
 add_subdirectory(build_tools/third_party/renderdoc_api EXCLUDE_FROM_ALL)
 add_subdirectory(build_tools/third_party/vulkan_extensionlayer EXCLUDE_FROM_ALL)
 
+if(CMAKE_CROSSCOMPILING)
+  # We need flatc to generate some source code. When cross-compiling, we need
+  # to make sure the flatc binary is configured under host environment.
+  iree_declare_host_excutable(flatc BUILDONLY)
+
+  # Set the FLATBUFFERS_FLATC_EXECUTABLE. It controls where to find the flatc
+  # binary in BuildFlatBuffers().
+  iree_get_executable_path(FLATBUFFERS_FLATC_EXECUTABLE flatc)
+
+  # Add a custom target to copy the flatc to the binary directory.
+  add_custom_target(iree_host_flatc
+    COMMAND "${CMAKE_COMMAND}" -E copy_if_different
+      "${IREE_HOST_BINARY_ROOT}/third_party/flatbuffers/flatc" "${IREE_HOST_BINARY_ROOT}/bin"
+    DEPENDS iree_host_build_flatc
+    COMMENT "Installing host flatc..."
+  )
+endif()
+
 if(${IREE_BUILD_COMPILER})
   add_subdirectory(build_tools/third_party/tensorflow/tensorflow/compiler/mlir/xla EXCLUDE_FROM_ALL)
 endif()

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index ef46ddc..117179d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md

@@ -50,6 +50,11 @@
 write these as you would a helpful commit message. Please also keep PRs small
 (focused on a single issue) to streamline review and ease later culprit-finding.
 
+As part of a migration to make the project GitHub-first, our default branch is
+currently called `google` and all PRs should be directed there. This is an
+intermediate state. See
+https://groups.google.com/d/msg/iree-discuss/F07vsG9Ah4o/uAIusKO-BQAJ
+
 Our documentation on
 [repository management](https://github.com/google/iree/blob/master/docs/repository_management.md)
 has more information on some of the oddities in our repository setup and

diff --git a/README.md b/README.md
index c6df57e..84eccc6 100644
--- a/README.md
+++ b/README.md

@@ -112,10 +112,10 @@
 
 CI System | Build System | Platform | Component       | Status
 :-------: | :----------: | :------: | :-------------: | :----:
-Kokoro    | Bazel        | Linux    | Core            | [![kokoro-status-bazel-linux](https://storage.googleapis.com/iree-oss-build-badges/linux/bazel/core/build_status.svg)](https://storage.googleapis.com/iree-oss-build-badges/linux/bazel/core/build_result.html)
-Kokoro    | Bazel        | Linux    | Bindings        | [![kokoro-status-bazel-linux](https://storage.googleapis.com/iree-oss-build-badges/linux/bazel/bindings/build_status.svg)](https://storage.googleapis.com/iree-oss-build-badges/linux/bazel/bindings/build_result.html)
-Kokoro    | Bazel        | Linux    | Integrations    | [![kokoro-status-bazel-linux](https://storage.googleapis.com/iree-oss-build-badges/linux/bazel/integrations/build_status.svg)](https://storage.googleapis.com/iree-oss-build-badges/linux/bazel/integrations/build_result.html)
-Kokoro    | CMake        | Linux    | Core + Bindings | [![kokoro-status-cmake-linux](https://storage.googleapis.com/iree-oss-build-badges/linux/cmake/build_status.svg)](https://storage.googleapis.com/iree-oss-build-badges/linux/cmake/build_result.html)
+Kokoro    | Bazel        | Linux    | Core            | [![kokoro-status-bazel-linux](https://storage.googleapis.com/iree-oss-build-badges/linux/bazel/core/google_status.svg)](https://storage.googleapis.com/iree-oss-build-badges/linux/bazel/core/google_result.html)
+Kokoro    | Bazel        | Linux    | Bindings        | [![kokoro-status-bazel-linux](https://storage.googleapis.com/iree-oss-build-badges/linux/bazel/bindings/google_status.svg)](https://storage.googleapis.com/iree-oss-build-badges/linux/bazel/bindings/google_result.html)
+Kokoro    | Bazel        | Linux    | Integrations    | [![kokoro-status-bazel-linux](https://storage.googleapis.com/iree-oss-build-badges/linux/bazel/integrations/google_status.svg)](https://storage.googleapis.com/iree-oss-build-badges/linux/bazel/integrations/google_result.html)
+Kokoro    | CMake        | Linux    | Core + Bindings | [![kokoro-status-cmake-linux](https://storage.googleapis.com/iree-oss-build-badges/linux/cmake/google_status.svg)](https://storage.googleapis.com/iree-oss-build-badges/linux/cmake/google_result.html)
 
 ## License
 

diff --git a/SUBMODULE_VERSIONS b/SUBMODULE_VERSIONS
index bc5a8a1..19c9321 100644
--- a/SUBMODULE_VERSIONS
+++ b/SUBMODULE_VERSIONS

@@ -3,7 +3,7 @@
 4c13807b7d43ff0946b7ffea0ae3aee9e611d778 third_party/dear_imgui
 a5d9d0f7d368054fd1691aedf1db4116efcc233e third_party/flatbuffers
 f2fb48c3b3d79a75a88a99fba6576b25d42ec528 third_party/googletest
-7e825abd5704ce28b166f9463d4bd304348fd2a9 third_party/llvm-project
+9fb7e98db5aaef617878a127b663efa4d01aa834 third_party/llvm-project
 17b12a4481daa150e2d1ea3ada086b551b856707 third_party/marl
 67f3ccebee84f3488b46a8d3ac005178c52ff264 third_party/mlir-emitc
 80d452484c5409444b0ec19383faa84bb7a4d351 third_party/pybind11
@@ -11,7 +11,7 @@
 b73f111094da3e380a1774b56b15f16c90ae8e23 third_party/sdl2
 f8bf11a0253a32375c32cad92c841237b96696c0 third_party/spirv_headers
 57eb48aed36160c4876bc8310d9ca84d42ee9e2a third_party/swiftshader
-b00a7808a7b29a78762b54e29aac87a77254b4b6 third_party/tensorflow
+f74654ac7b314a212b1df6687c2f99800084e97f third_party/tensorflow
 864d86e8b6d21449474db5e9313dbff90aa9c24f third_party/tracy
 8a457f8552d8d47ce3a96ed80a714ff6396f8ad8 third_party/vulkan_extensionlayer
 9bd3f561bcee3f01d22912de10bb07ce4e23d378 third_party/vulkan_headers

diff --git a/bindings/python/pyiree/rt/function_abi.cc b/bindings/python/pyiree/rt/function_abi.cc
index c97b3c8..6b8c01d 100644
--- a/bindings/python/pyiree/rt/function_abi.cc
+++ b/bindings/python/pyiree/rt/function_abi.cc

@@ -164,6 +164,48 @@
   }
 }
 
+void PackScalar(const RawSignatureParser::Description& desc, py::handle py_arg,
+                VmVariantList& f_args) {
+  iree_vm_value value;
+  value.type = IREE_VM_VALUE_TYPE_I32;
+  switch (desc.scalar.type) {
+    case AbiConstants::ScalarType::kUint8:
+    case AbiConstants::ScalarType::kUint16:
+    case AbiConstants::ScalarType::kUint32: {
+      value.i32 = py_arg.cast<int32_t>();
+      break;
+    }
+    case AbiConstants::ScalarType::kSint8:
+    case AbiConstants::ScalarType::kSint16:
+    case AbiConstants::ScalarType::kSint32: {
+      value.i32 = py_arg.cast<int32_t>();
+      break;
+    }
+    default:
+      throw RaisePyError(PyExc_NotImplementedError, "Unsupported scalar type");
+  }
+  CheckApiStatus(iree_vm_variant_list_append_value(f_args.raw_ptr(), value),
+                 "Could not pack scalar argument");
+}
+
+py::object UnpackScalar(const RawSignatureParser::Description& desc,
+                        iree_vm_variant_t& f_result) {
+  switch (desc.scalar.type) {
+    case AbiConstants::ScalarType::kUint8:
+    case AbiConstants::ScalarType::kUint16:
+    case AbiConstants::ScalarType::kUint32: {
+      return py::int_(static_cast<uint32_t>(f_result.i32));
+    }
+    case AbiConstants::ScalarType::kSint8:
+    case AbiConstants::ScalarType::kSint16:
+    case AbiConstants::ScalarType::kSint32: {
+      return py::int_(f_result.i32);
+    }
+    default:
+      throw RaisePyError(PyExc_NotImplementedError, "Unsupported scalar type");
+  }
+}
+
 }  // namespace
 
 //------------------------------------------------------------------------------
@@ -236,6 +278,9 @@
         throw RaisePyError(PyExc_NotImplementedError,
                            "Ref objects not yet supported");
         break;
+      case RawSignatureParser::Type::kScalar:
+        PackScalar(desc, py_args[i], f_args);
+        break;
       default:
         throw RaisePyError(PyExc_NotImplementedError,
                            "Unsupported argument type");
@@ -294,9 +339,12 @@
         throw RaisePyError(PyExc_NotImplementedError,
                            "Ref objects not yet supported");
         break;
+      case RawSignatureParser::Type::kScalar:
+        py_results[i] = UnpackScalar(desc, *f_result);
+        break;
       default:
         throw RaisePyError(PyExc_NotImplementedError,
-                           "Unsupported argument type");
+                           "Unsupported result type");
     }
   }
 }
@@ -358,9 +406,11 @@
         throw RaisePyError(PyExc_NotImplementedError,
                            "Ref objects not yet supported");
         break;
+      case RawSignatureParser::Type::kScalar:
+        break;
       default:
         throw RaisePyError(PyExc_NotImplementedError,
-                           "Unsupported argument type");
+                           "Unsupported allocation argument type");
     }
   }
 }

diff --git a/bindings/python/pyiree/rt/vm_test.py b/bindings/python/pyiree/rt/vm_test.py
index ed7e66f..6b633ce 100644
--- a/bindings/python/pyiree/rt/vm_test.py
+++ b/bindings/python/pyiree/rt/vm_test.py

@@ -21,6 +21,19 @@
 from pyiree import rt
 
 
+def create_add_scalar_module():
+  ctx = compiler.Context()
+  input_module = ctx.parse_asm("""
+    func @add_scalar(%arg0: i32, %arg1: i32) -> i32 attributes { iree.module.export } {
+      %0 = addi %arg0, %arg1 : i32
+      return %0 : i32
+    }
+    """)
+  binary = input_module.compile()
+  m = rt.VmModule.from_flatbuffer(binary)
+  return m
+
+
 def create_simple_static_mul_module():
   ctx = compiler.Context()
   input_module = ctx.parse_asm("""
@@ -103,6 +116,26 @@
     context = rt.VmContext(instance, modules=[self.hal_module, m])
     print(context)
 
+  def test_add_scalar(self):
+    m = create_add_scalar_module()
+    instance = rt.VmInstance()
+    context = rt.VmContext(instance, modules=[self.hal_module, m])
+    f = m.lookup_function("add_scalar")
+    abi = context.create_function_abi(self.device, self.htf, f)
+    print("INVOKING:", abi)
+    arg0 = np.array([1., 2., 3., 4.], dtype=np.float32)
+    arg1 = np.array([4., 5., 6., 7.], dtype=np.float32)
+    inputs = abi.raw_pack_inputs((5, 6))
+    print("INPUTS:", inputs)
+    allocated_results = abi.allocate_results(inputs, static_alloc=False)
+    print("ALLOCATED RESULTS:", allocated_results)
+    print("--- INVOKE:")
+    context.invoke(f, inputs, allocated_results)
+    print("--- DONE.")
+    results = abi.raw_unpack_results(allocated_results)
+    print("RESULTS:", results)
+    self.assertEqual(results[0], 11)
+
   def test_synchronous_dynamic_shape_invoke_function(self):
     m = create_simple_dynamic_abs_module()
     instance = rt.VmInstance()

diff --git a/build_tools/bazel/third_party_import/llvm-project/overlay/llvm/BUILD.bazel b/build_tools/bazel/third_party_import/llvm-project/overlay/llvm/BUILD.bazel
index 50b71a3..75a5326 100644
--- a/build_tools/bazel/third_party_import/llvm-project/overlay/llvm/BUILD.bazel
+++ b/build_tools/bazel/third_party_import/llvm-project/overlay/llvm/BUILD.bazel

@@ -561,6 +561,7 @@
     name = "common_target_td_sources",
     srcs = glob([
         "include/llvm/CodeGen/*.td",
+        "include/llvm/Frontend/Directive/*.td",
         "include/llvm/IR/Intrinsics*.td",
         "include/llvm/TableGen/*.td",
         "include/llvm/Target/*.td",
@@ -666,6 +667,17 @@
     ],
 )
 
+gentbl(
+    name = "omp_gen",
+    tbl_outs = [("--gen-directive-decls", "include/llvm/Frontend/OpenMP/OMP.h.inc")],
+    tblgen = ":llvm-tblgen",
+    td_file = "include/llvm/Frontend/OpenMP/OMP.td",
+    td_srcs = glob([
+        "include/llvm/Frontend/OpenMP/*.td",
+        "include/llvm/Frontend/Directive/*.td",
+    ]),
+)
+
 ########################## Begin generated content ##########################
 cc_library(
     name = "AArch64AsmParser",
@@ -698,6 +710,7 @@
         "lib/Target/AArch64/*.c",
         "lib/Target/AArch64/*.cpp",
         "lib/Target/AArch64/*.inc",
+        "lib/Target/AArch64/GISel/*.cpp",
     ]),
     hdrs = glob([
         "include/llvm/Target/AArch64/*.h",
@@ -1382,14 +1395,20 @@
 
 cc_library(
     name = "Analysis",
-    srcs = glob([
-        "lib/Analysis/*.c",
-        "lib/Analysis/*.cpp",
-        "lib/Analysis/*.inc",
-        "include/llvm/Transforms/Utils/Local.h",
-        "include/llvm/Transforms/Scalar.h",
-        "lib/Analysis/*.h",
-    ]),
+    srcs = glob(
+        [
+            "lib/Analysis/*.c",
+            "lib/Analysis/*.cpp",
+            "lib/Analysis/*.inc",
+            "include/llvm/Transforms/Utils/Local.h",
+            "include/llvm/Transforms/Scalar.h",
+            "lib/Analysis/*.h",
+        ],
+        exclude = [
+            "lib/Analysis/MLInlineAdvisor.cpp",
+            "lib/Analysis/ReleaseModeModelRunner.cpp",
+        ],
+    ),
     hdrs = glob([
         "include/llvm/Analysis/*.h",
         "include/llvm/Analysis/*.def",
@@ -2052,6 +2071,7 @@
         ":Support",
         ":TransformUtils",
         ":config",
+        ":omp_gen",
     ],
 )
 

diff --git a/build_tools/bazel/third_party_import/llvm-project/overlay/mlir/BUILD.bazel b/build_tools/bazel/third_party_import/llvm-project/overlay/mlir/BUILD.bazel
index a0d1066..ba9b580 100644
--- a/build_tools/bazel/third_party_import/llvm-project/overlay/mlir/BUILD.bazel
+++ b/build_tools/bazel/third_party_import/llvm-project/overlay/mlir/BUILD.bazel

@@ -282,7 +282,7 @@
     deps = [
         ":AVX512IncGen",
         ":IR",
-        ":SideEffects",
+        ":SideEffectInterfaces",
         ":VectorOps",
         "@llvm-project//llvm:Core",
         "@llvm-project//llvm:Support",
@@ -305,9 +305,9 @@
         ":IR",
         ":LLVMAVX512",
         ":LLVMDialect",
-        ":LLVMTransforms",
         ":Pass",
         ":StandardOps",
+        ":StandardToLLVM",
         ":Support",
         ":Transforms",
         ":VectorOps",
@@ -489,7 +489,7 @@
         ":EDSC",
         ":IR",
         ":LoopLikeInterface",
-        ":SideEffects",
+        ":SideEffectInterfaces",
         ":StandardOps",
         ":Support",
         "@llvm-project//llvm:Support",
@@ -571,7 +571,7 @@
 )
 
 cc_library(
-    name = "AffineToStandardTransforms",
+    name = "AffineToStandard",
     srcs = glob([
         "lib/Conversion/AffineToStandard/*.cpp",
         "lib/Conversion/AffineToStandard/*.h",
@@ -591,6 +591,11 @@
     ],
 )
 
+alias(
+    name = "AffineToStandardTransforms",
+    actual = "AffineToStandard",
+)
+
 # SDBM dialect only contains attribute components that can be constructed given
 # a dialect object, so whenever it is used it must also be registered. Therefore
 # we don't split out the registration library for it.
@@ -631,7 +636,7 @@
         ":IR",
         ":LoopLikeInterface",
         ":SCFIncGen",
-        ":SideEffects",
+        ":SideEffectInterfaces",
         ":StandardOps",
         ":Support",
         "@llvm-project//llvm:Support",
@@ -719,7 +724,7 @@
         ":InferTypeOpInterface",
         ":MLIRShapeCanonicalizationIncGen",
         ":ShapeOpsIncGen",
-        ":SideEffects",
+        ":SideEffectInterfaces",
         ":Support",
         "@llvm-project//llvm:Support",
     ],
@@ -833,7 +838,7 @@
         ":ControlFlowInterfaces",
         ":EDSC",
         ":IR",
-        ":SideEffects",
+        ":SideEffectInterfaces",
         ":StandardOpsIncGen",
         ":Support",
         ":ViewLikeInterface",
@@ -895,7 +900,7 @@
         ":DialectUtils",
         ":EDSC",
         ":IR",
-        ":SideEffects",
+        ":SideEffectInterfaces",
         ":StandardOps",
         ":Support",
         ":VectorOpsIncGen",
@@ -1070,7 +1075,7 @@
         ":ControlFlowInterfaces",
         ":IR",
         ":LLVMOpsIncGen",
-        ":SideEffects",
+        ":SideEffectInterfaces",
         ":Support",
         "@llvm-project//llvm:AsmParser",
         "@llvm-project//llvm:BitReader",
@@ -1193,7 +1198,7 @@
         ":GPUOpsIncGen",
         ":IR",
         ":LLVMDialect",
-        ":SideEffects",
+        ":SideEffectInterfaces",
         ":StandardOps",
         ":Support",
     ],
@@ -1271,8 +1276,8 @@
         ":GPUDialect",
         ":IR",
         ":LLVMDialect",
-        ":LLVMTransforms",
         ":StandardOps",
+        ":StandardToLLVM",
         "@llvm-project//llvm:Support",
     ],
 )
@@ -1311,9 +1316,9 @@
         ":GPUToNVVMGen",
         ":GPUTransforms",
         ":IR",
-        ":LLVMTransforms",
         ":NVVMDialect",
         ":Pass",
+        ":StandardToLLVM",
         ":Transforms",
         "@llvm-project//llvm:Support",
     ],
@@ -1333,10 +1338,10 @@
         ":ConversionPassIncGen",
         ":GPUDialect",
         ":LLVMDialect",
-        ":LLVMTransforms",
         ":Pass",
         ":ROCDLDialect",
         ":StandardOps",
+        ":StandardToLLVM",
         ":Transforms",
         ":VectorOps",
     ],
@@ -1375,9 +1380,9 @@
         ":GPUDialect",
         ":GPUToROCDLTGen",
         ":GPUTransforms",
-        ":LLVMTransforms",
         ":Pass",
         ":ROCDLDialect",
+        ":StandardToLLVM",
         ":Transforms",
         ":VectorOps",
         ":VectorToLLVM",
@@ -1475,7 +1480,7 @@
         ":SCFDialect",
         ":SPIRVDialect",
         ":SPIRVLowering",
-        ":StandardToSPIRVConversions",
+        ":StandardToSPIRVTransforms",
         ":Support",
         ":Transforms",
     ],
@@ -1496,12 +1501,13 @@
         ":ConversionPassIncGen",
         ":IR",
         ":LLVMDialect",
-        ":LLVMTransforms",
         ":Pass",
         ":SPIRVDialect",
         ":StandardOps",
+        ":StandardToLLVM",
         ":Support",
         ":Transforms",
+        "@llvm-project//llvm:Support",
     ],
 )
 
@@ -1574,7 +1580,7 @@
         ":IR",
         ":LLVMDialect",
         ":NVVMOpsIncGen",
-        ":SideEffects",
+        ":SideEffectInterfaces",
         ":StandardOps",
         ":Support",
         "@llvm-project//llvm:AsmParser",
@@ -1646,7 +1652,7 @@
         ":IR",
         ":LLVMDialect",
         ":ROCDLOpsIncGen",
-        ":SideEffects",
+        ":SideEffectInterfaces",
         ":StandardOps",
         ":Support",
         "@llvm-project//llvm:AsmParser",
@@ -1894,7 +1900,7 @@
         ":SPIRVOpsIncGen",
         ":SPIRVSerializationGen",
         ":SPIRVTargetAndABIStructGen",
-        ":SideEffects",
+        ":SideEffectInterfaces",
         ":Support",
         ":Transforms",
         "@llvm-project//llvm:Support",
@@ -1947,7 +1953,7 @@
 )
 
 cc_library(
-    name = "StandardToSPIRVConversions",
+    name = "StandardToSPIRVTransforms",
     srcs = glob([
         "lib/Conversion/StandardToSPIRV/*.cpp",
         "lib/Conversion/StandardToSPIRV/*.h",
@@ -1968,10 +1974,16 @@
         ":StandardOps",
         ":Support",
         ":Transforms",
+        ":VectorOps",
         "@llvm-project//llvm:Support",
     ],
 )
 
+alias(
+    name = "StandardToSPIRVConversions",
+    actual = "StandardToSPIRVTransforms",
+)
+
 cc_library(
     name = "SPIRVSerialization",
     srcs = glob(
@@ -2033,7 +2045,7 @@
         ":IR",
         ":LoopLikeInterface",
         ":SCFDialect",
-        ":SideEffects",
+        ":SideEffectInterfaces",
         ":StandardOps",
         ":Support",
         "@llvm-project//llvm:Support",
@@ -2152,7 +2164,7 @@
         ":LoopLikeInterface",
         ":Pass",
         ":SCFDialect",
-        ":SideEffects",
+        ":SideEffectInterfaces",
         ":StandardOps",
         ":Support",
         ":TransformUtils",
@@ -2182,7 +2194,7 @@
     includes = ["include"],
     deps = [
         ":Affine",
-        ":AffineToStandardTransforms",
+        ":AffineToStandard",
         ":ConversionPassIncGen",
         ":GPUDialect",
         ":GPUTransforms",
@@ -2222,7 +2234,7 @@
 )
 
 cc_library(
-    name = "CFGTransforms",
+    name = "SCFToStandard",
     srcs = [
         "lib/Conversion/PassDetail.h",
         "lib/Conversion/SCFToStandard/SCFToStandard.cpp",
@@ -2244,8 +2256,13 @@
     ],
 )
 
+alias(
+    name = "CFGTransforms",
+    actual = "SCFToStandard",
+)
+
 cc_library(
-    name = "LLVMTransforms",
+    name = "StandardToLLVM",
     srcs = [
         "lib/Conversion/PassDetail.h",
         "lib/Conversion/StandardToLLVM/StandardToLLVM.cpp",
@@ -2269,6 +2286,11 @@
     ],
 )
 
+alias(
+    name = "LLVMTransforms",
+    actual = "StandardToLLVM",
+)
+
 gentbl(
     name = "CallOpInterfacesIncGen",
     strip_include_prefix = "include",
@@ -2401,7 +2423,7 @@
 )
 
 cc_library(
-    name = "SideEffects",
+    name = "SideEffectInterfaces",
     srcs = [
         "lib/Interfaces/SideEffectInterfaces.cpp",
     ],
@@ -2417,6 +2439,11 @@
     ],
 )
 
+alias(
+    name = "SideEffects",
+    actual = "SideEffectInterfaces",
+)
+
 cc_library(
     name = "Analysis",
     srcs = glob(
@@ -2627,7 +2654,6 @@
         ":GPUTransforms",
         ":IR",
         ":LLVMDialect",
-        ":LLVMTransforms",
         ":LinalgToLLVM",
         ":LinalgToSPIRV",
         ":LinalgToStandard",
@@ -2639,7 +2665,8 @@
         ":ShapeToStandard",
         ":ShapeTransforms",
         ":StandardOpsTransforms",
-        ":StandardToSPIRVConversions",
+        ":StandardToLLVM",
+        ":StandardToSPIRVTransforms",
         ":Support",
         ":Transforms",
         ":VectorToLLVM",
@@ -2699,7 +2726,6 @@
         ":Affine",
         ":AffinePassIncGen",
         ":AffineTransforms",
-        ":CFGTransforms",
         ":ConversionPassIncGen",
         ":GPUDialect",
         ":GPUPassIncGen",
@@ -2714,7 +2740,6 @@
         ":LLVMDialect",
         ":LLVMIRTransforms",
         ":LLVMPassIncGen",
-        ":LLVMTransforms",
         ":LinalgOps",
         ":LinalgPassIncGen",
         ":LinalgToLLVM",
@@ -2729,6 +2754,7 @@
         ":ROCDLDialect",
         ":SCFDialect",
         ":SCFToGPUPass",
+        ":SCFToStandard",
         ":SCFTransforms",
         ":SDBM",
         ":SPIRVDialect",
@@ -2743,7 +2769,8 @@
         ":StandardOps",
         ":StandardOpsTransforms",
         ":StandardOpsTransformsPassIncGen",
-        ":StandardToSPIRVConversions",
+        ":StandardToLLVM",
+        ":StandardToSPIRVTransforms",
         ":Transforms",
         ":TransformsPassIncGen",
         ":VectorOps",
@@ -2809,13 +2836,13 @@
     includes = ["include"],
     deps = [
         ":AllPassesAndDialectsNoRegistration",
-        ":CFGTransforms",
         ":ExecutionEngine",
         ":ExecutionEngineUtils",
         ":IR",
         ":LLVMDialect",
         ":Parser",
         ":Pass",
+        ":SCFToStandard",
         ":Support",
         "@llvm-project//llvm:Core",
         "@llvm-project//llvm:OrcJIT",
@@ -2885,7 +2912,7 @@
         ":IR",
         ":Pass",
         ":SPIRVDialect",
-        ":SideEffects",
+        ":SideEffectInterfaces",
         ":StandardOps",
         ":Support",
         "@llvm-project//llvm:Support",
@@ -2918,10 +2945,10 @@
         ":GPUTransforms",
         ":IR",
         ":LLVMDialect",
-        ":LLVMTransforms",
         ":MlirJitRunner",
         ":NVVMDialect",
         ":Pass",
+        ":StandardToLLVM",
         ":TargetNVVMIR",
         ":Transforms",
         "//devtools/build/runtime:get_runfiles_dir",
@@ -2945,11 +2972,11 @@
         ":GPUToSPIRVTransforms",
         ":GPUToVulkanTransforms",
         ":GPUTransforms",
-        ":LLVMTransforms",
         ":MlirJitRunner",
         ":Pass",
         ":SPIRVDialect",
-        ":StandardToSPIRVConversions",
+        ":StandardToLLVM",
+        ":StandardToSPIRVTransforms",
         "@llvm-project//llvm:Support",
     ],
 )
@@ -3159,7 +3186,7 @@
         ":Pass",
         ":QuantOpsIncGen",
         ":QuantPassIncGen",
-        ":SideEffects",
+        ":SideEffectInterfaces",
         ":StandardOps",
         "@llvm-project//llvm:Support",
     ],
@@ -3294,18 +3321,18 @@
     ]),
     includes = ["include"],
     deps = [
-        ":AffineToStandardTransforms",
+        ":AffineToStandard",
         ":Analysis",
-        ":CFGTransforms",
         ":ConversionPassIncGen",
         ":EDSC",
         ":IR",
         ":LLVMDialect",
-        ":LLVMTransforms",
         ":LinalgOps",
         ":LinalgTransforms",
         ":Pass",
+        ":SCFToStandard",
         ":StandardOps",
+        ":StandardToLLVM",
         ":Support",
         ":Transforms",
         ":VectorToLLVM",
@@ -3385,7 +3412,7 @@
         ":LinalgOpsIncGen",
         ":LinalgStructuredOpsIncGen",
         ":Parser",
-        ":SideEffects",
+        ":SideEffectInterfaces",
         ":StandardOps",
         ":Support",
         ":ViewLikeInterface",
@@ -3431,21 +3458,21 @@
     includes = ["include"],
     deps = [
         ":Affine",
-        ":AffineToStandardTransforms",
+        ":AffineToStandard",
         ":Analysis",
-        ":CFGTransforms",
         ":DialectUtils",
         ":EDSC",
         ":IR",
         ":LLVMDialect",
-        ":LLVMTransforms",
         ":LinalgOps",
         ":LinalgPassIncGen",
         ":LinalgStructuredOpsIncGen",
         ":Pass",
         ":SCFDialect",
+        ":SCFToStandard",
         ":SCFTransforms",
         ":StandardOps",
+        ":StandardToLLVM",
         ":Support",
         ":TransformUtils",
         ":Transforms",
@@ -3537,9 +3564,9 @@
         ":EDSC",
         ":IR",
         ":LLVMDialect",
-        ":LLVMTransforms",
         ":Pass",
         ":StandardOps",
+        ":StandardToLLVM",
         ":Support",
         ":Transforms",
         ":VectorOps",
@@ -3564,10 +3591,10 @@
         ":EDSC",
         ":IR",
         ":LLVMDialect",
-        ":LLVMTransforms",
         ":Pass",
         ":SCFDialect",
         ":StandardOps",
+        ":StandardToLLVM",
         ":Support",
         ":Transforms",
         ":VectorOps",

diff --git a/build_tools/cmake/flatbuffer_cc_library.cmake b/build_tools/cmake/flatbuffer_cc_library.cmake
index 6ad2995..febf234 100644
--- a/build_tools/cmake/flatbuffer_cc_library.cmake
+++ b/build_tools/cmake/flatbuffer_cc_library.cmake

@@ -95,18 +95,24 @@
     set(FLATBUFFERS_FLATC_SCHEMA_EXTRA_ARGS ${_RULE_FLATC_ARGS})
   endif()
 
+  set(_GEN_TARGET "${_NAME}_gen")
+
   build_flatbuffers(
     "${_RULE_SRCS}"
     "${IREE_ROOT_DIR}"
-    "${_NAME}_gen"  # custom_target_name
-    "${_RULE_DEPS}" # additional_dependencies
+    "${_GEN_TARGET}" # custom_target_name
+    "${_RULE_DEPS}"  # additional_dependencies
     "${CMAKE_CURRENT_BINARY_DIR}" # generated_include_dir
     "${CMAKE_CURRENT_BINARY_DIR}" # binary_schemas_dir
     "" # copy_text_schemas_dir
   )
 
+  # Add dependency on flatc explicitly. This is needed for cross-compiling
+  # where flatc comes from another CMake invocation for host.
+  iree_add_executable_dependencies(${_GEN_TARGET} flatc)
+
   add_library(${_NAME} INTERFACE)
-  add_dependencies(${_NAME} ${_NAME}_gen)
+  add_dependencies(${_NAME} ${_GEN_TARGET})
   target_include_directories(${_NAME}
     INTERFACE
       "$<BUILD_INTERFACE:${IREE_COMMON_INCLUDE_DIRS}>"

diff --git a/build_tools/cmake/iree_bytecode_module.cmake b/build_tools/cmake/iree_bytecode_module.cmake
index f8002ec..64f8fd0 100644
--- a/build_tools/cmake/iree_bytecode_module.cmake
+++ b/build_tools/cmake/iree_bytecode_module.cmake

@@ -56,23 +56,24 @@
   if(DEFINED _RULE_TRANSLATE_TOOL)
     set(_TRANSLATE_TOOL ${_RULE_TRANSLATE_TOOL})
   else()
-    set(_TRANSLATE_TOOL "iree_tools_iree-translate")
+    set(_TRANSLATE_TOOL "iree-translate")
   endif()
 
-  # Resolve the executable binary path from the target name.
-  set(_TRANSLATE_TOOL_EXECUTABLE $<TARGET_FILE:${_TRANSLATE_TOOL}>)
+  iree_get_executable_path(_TRANSLATE_TOOL_EXECUTABLE ${_TRANSLATE_TOOL})
 
   set(_ARGS "${_FLAGS}")
   list(APPEND _ARGS "${CMAKE_CURRENT_SOURCE_DIR}/${_RULE_SRC}")
   list(APPEND _ARGS "-o")
   list(APPEND _ARGS "${_RULE_NAME}.module")
 
+  # Depending on the binary instead of the target here given we might not have
+  # a target in this CMake invocation when cross-compiling.
   add_custom_command(
     OUTPUT "${_RULE_NAME}.module"
     COMMAND ${_TRANSLATE_TOOL_EXECUTABLE} ${_ARGS}
     # Changes to either the translation tool or the input source should
     # trigger rebuilding.
-    DEPENDS ${_TRANSLATE_TOOL} ${_RULE_SRC}
+    DEPENDS ${_TRANSLATE_TOOL_EXECUTABLE} ${_RULE_SRC}
   )
 
   if(_RULE_TESTONLY)

diff --git a/build_tools/cmake/iree_cc_binary.cmake b/build_tools/cmake/iree_cc_binary.cmake
index b4d6eff..6b3653a 100644
--- a/build_tools/cmake/iree_cc_binary.cmake
+++ b/build_tools/cmake/iree_cc_binary.cmake

@@ -30,6 +30,8 @@
 # COPTS: List of private compile options
 # DEFINES: List of public defines
 # LINKOPTS: List of link options
+# TESTONLY: for testing; won't compile when tests are disabled
+# HOSTONLY: host only; compile using host toolchain when cross-compiling
 #
 # Note:
 # By default, iree_cc_binary will always create a binary named iree_${NAME}.
@@ -58,7 +60,7 @@
 function(iree_cc_binary)
   cmake_parse_arguments(
     _RULE
-    "TESTONLY"
+    "HOSTONLY;TESTONLY"
     "NAME;OUT"
     "SRCS;COPTS;DEFINES;LINKOPTS;DATA;DEPS"
     ${ARGN}
@@ -68,6 +70,14 @@
     return()
   endif()
 
+  if(_RULE_HOSTONLY AND CMAKE_CROSSCOMPILING)
+    # The binary is marked as host only. We need to declare the rules for
+    # generating them under host configuration so cross-compiling towards
+    # target we can still have this binary.
+    iree_declare_host_excutable(${_RULE_NAME})
+    return()
+  endif()
+
   # Prefix the library with the package name, so we get: iree_package_name
   iree_package_name(_PACKAGE_NAME)
   set(_NAME "${_PACKAGE_NAME}_${_RULE_NAME}")
@@ -126,6 +136,11 @@
   # Track target and deps, use in iree_complete_binary_link_options() later.
   set_property(GLOBAL APPEND PROPERTY _IREE_CC_BINARY_NAMES "${_NAME}")
   set_property(TARGET ${_NAME} PROPERTY DIRECT_DEPS ${_RULE_DEPS})
+
+  install(TARGETS ${_NAME}
+          RENAME ${_RULE_NAME}
+          COMPONENT ${_RULE_NAME}
+          RUNTIME DESTINATION bin)
 endfunction()
 
 # Lists all transitive dependencies of DIRECT_DEPS in TRANSITIVE_DEPS.

diff --git a/build_tools/cmake/iree_cc_embed_data.cmake b/build_tools/cmake/iree_cc_embed_data.cmake
index d3644ed..7eeac23 100644
--- a/build_tools/cmake/iree_cc_embed_data.cmake
+++ b/build_tools/cmake/iree_cc_embed_data.cmake

@@ -79,10 +79,12 @@
     list(APPEND _ARGS "${SRC}")
   endforeach(SRC)
 
+  iree_get_executable_path(_EXE_PATH generate_cc_embed_data)
+
   add_custom_command(
     OUTPUT "${_RULE_H_FILE_OUTPUT}" "${_RULE_CC_FILE_OUTPUT}"
-    COMMAND generate_cc_embed_data ${_ARGS}
-    DEPENDS generate_cc_embed_data ${_RULE_SRCS} ${_RULE_GENERATED_SRCS}
+    COMMAND ${_EXE_PATH} ${_ARGS}
+    DEPENDS ${_EXE_PATH} ${_RULE_SRCS} ${_RULE_GENERATED_SRCS}
   )
 
   if(_RULE_TESTONLY)

diff --git a/build_tools/cmake/iree_copts.cmake b/build_tools/cmake/iree_copts.cmake
index cec163a..542536b 100644
--- a/build_tools/cmake/iree_copts.cmake
+++ b/build_tools/cmake/iree_copts.cmake

@@ -12,8 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+#-------------------------------------------------------------------------------
+# Abseil configuration
+#-------------------------------------------------------------------------------
+
 include(AbseilConfigureCopts)
 
+# By default Abseil strips string literals on mobile platforms, which means
+# we cannot run IREE binaries via command-line with proper options. Turn off
+# the stripping.
+# TODO: we might still want to strip when compiling IREE into Android Java apps.
+if(CMAKE_CROSSCOMPILING AND "${CMAKE_SYSTEM_NAME}" MATCHES "Android")
+  add_definitions(-DABSL_FLAGS_STRIP_NAMES=0)
+endif()
+
 #-------------------------------------------------------------------------------
 # C++ used within IREE
 #-------------------------------------------------------------------------------
@@ -42,6 +54,9 @@
     "-Wno-gnu-label-as-value"
     "-Wno-unused-local-typedef"
     "-Wno-gnu-zero-variadic-macro-arguments"
+    # Enable some warnings
+    "-Wimplicit-fallthrough"
+    "-Wthread-safety-analysis"
   CLANG_OR_GCC
     "-Wno-unused-parameter"
     "-Wno-undef"
@@ -89,13 +104,19 @@
 #-------------------------------------------------------------------------------
 
 set(FLATBUFFERS_BUILD_TESTS OFF CACHE BOOL "" FORCE)
-set(FLATBUFFERS_INSTALL OFF CACHE BOOL "" FORCE)
-set(FLATBUFFERS_BUILD_FLATC ON CACHE BOOL "" FORCE)
 set(FLATBUFFERS_BUILD_FLATHASH OFF CACHE BOOL "" FORCE)
 set(FLATBUFFERS_BUILD_GRPCTEST OFF CACHE BOOL "" FORCE)
+set(FLATBUFFERS_INSTALL OFF CACHE BOOL "" FORCE)
 set(FLATBUFFERS_INCLUDE_DIRS
   "${PROJECT_SOURCE_DIR}/third_party/flatbuffers/include/"
 )
+
+if(CMAKE_CROSSCOMPILING)
+  set(FLATBUFFERS_BUILD_FLATC OFF CACHE BOOL "" FORCE)
+else()
+  set(FLATBUFFERS_BUILD_FLATC ON CACHE BOOL "" FORCE)
+endif()
+
 iree_select_compiler_opts(FLATBUFFERS_COPTS
   CLANG
     # Flatbuffers has a bunch of incorrect documentation annotations.
@@ -148,7 +169,9 @@
 endif()
 
 set(MLIR_TABLEGEN_EXE mlir-tblgen)
-set(IREE_TABLEGEN_EXE iree-tblgen)
+# iree-tblgen is not defined using the add_tablegen mechanism as other TableGen
+# tools in LLVM.
+iree_get_executable_path(IREE_TABLEGEN_EXE iree-tblgen)
 
 #-------------------------------------------------------------------------------
 # Third party: tensorflow

diff --git a/build_tools/cmake/iree_cross_compile.cmake b/build_tools/cmake/iree_cross_compile.cmake
new file mode 100644
index 0000000..2568abd
--- /dev/null
+++ b/build_tools/cmake/iree_cross_compile.cmake

@@ -0,0 +1,234 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include(iree_macros)
+
+# iree_create_configuration
+#
+# Creates custom commands and targets for an IREE configuration. An IREE
+# configuration means a new IREE CMake invocation with its own set of
+# parameters.
+#
+# This function defines a custom target, `iree_configure_${CONFIG_NAME}`,
+# to drive the generation of a new IREE configuration's `CMakeCache.txt`
+# file. Callers can then depend on either the `CMakeCache.txt` file or the
+# `iree_configure_${CONFIG_NAME}` target to make sure the configuration
+# is invoked as a dependency.
+#
+# This function is typically useful when cross-compiling towards another
+# architecture. For example, when cross-compiling towards Android, we need
+# to have certain tools first compiled on the host so that we can use them
+# to programmatically generate some source code to be compiled together
+# with other checked-in source code. Those host tools will be generated
+# by another CMake invocation configured by this function.
+#
+# Supported CMake options:
+# - IREE_<CONFIG_NAME>_BINARY_ROOT: the root directory for containing IREE build
+#   artifacts for the given `CONFIG_NAME`. If not specified in caller, this is
+#   set to a directory named as `CONFIG_NAME` under the current CMake binary
+#   directory.
+# - IREE_<CONFIG_NAME>_C_COMPILER: C compiler for the given `CONFIG_NAME`.
+#   This must be defined by the caller.
+# - IREE_<CONFIG_NAME>_CXX_COMPILER: C++ compiler for the given `CONFIG_NAME`.
+#   This must be defined by the caller.
+# - IREE_<CONFIG_NAME>_<option>: switch for the given `option` specifically for
+#   `CONFIG_NAME`. If missing, default to OFF for bool options; default to
+#   IREE_<option> for non-bool variables.
+function(iree_create_configuration CONFIG_NAME)
+  # Set IREE_${CONFIG_NAME}_BINARY_ROOT if missing.
+  if(NOT DEFINED IREE_${CONFIG_NAME}_BINARY_ROOT)
+    set(IREE_${CONFIG_NAME}_BINARY_ROOT "${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}")
+    set(IREE_${CONFIG_NAME}_BINARY_ROOT ${IREE_${CONFIG_NAME}_BINARY_ROOT} PARENT_SCOPE)
+    message(STATUS "Setting ${CONFIG_NAME} build directory to ${IREE_${CONFIG_NAME}_BINARY_ROOT}")
+  endif()
+
+  set(_CONFIG_BINARY_ROOT ${IREE_${CONFIG_NAME}_BINARY_ROOT})
+
+  set(_CONFIG_C_COMPILER ${IREE_${CONFIG_NAME}_C_COMPILER})
+  set(_CONFIG_CXX_COMPILER ${IREE_${CONFIG_NAME}_CXX_COMPILER})
+
+  # Check the compilers are specified in the caller.
+  if("${_CONFIG_C_COMPILER}" STREQUAL "")
+    message(FATAL_ERROR "Must define IREE_${CONFIG_NAME}_C_COMPILER for \"${CONFIG_NAME}\" configuration build")
+  endif()
+  if("${_CONFIG_CXX_COMPILER}" STREQUAL "")
+    message(FATAL_ERROR "Must define IREE_${CONFIG_NAME}_CXX_COMPILER for \"${CONFIG_NAME}\" configuration build")
+  endif()
+
+  add_custom_command(OUTPUT ${_CONFIG_BINARY_ROOT}
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${_CONFIG_BINARY_ROOT}
+    COMMENT "Creating ${_CONFIG_BINARY_ROOT}...")
+
+  # Give it a custom target so we can drive the generation manually
+  # when useful.
+  add_custom_target(iree_prepare_${CONFIG_NAME}_dir DEPENDS ${_CONFIG_BINARY_ROOT})
+
+  # LINT.IfChange(iree_cross_compile_options)
+  iree_to_bool(_CONFIG_ENABLE_RUNTIME_TRACING "${IREE_${CONFIG_NAME}_ENABLE_RUNTIME_TRACING}")
+  iree_to_bool(_CONFIG_ENABLE_MLIR "${IREE_${CONFIG_NAME}_ENABLE_MLIR}")
+  iree_to_bool(_CONFIG_ENABLE_EMITC "${IREE_${CONFIG_NAME}_ENABLE_EMITC}")
+
+  iree_to_bool(_CONFIG_BUILD_COMPILER "${IREE_${CONFIG_NAME}_BUILD_COMPILER}")
+  iree_to_bool(_CONFIG_BUILD_TESTS "${IREE_${CONFIG_NAME}_BUILD_TESTS}")
+  iree_to_bool(_CONFIG_BUILD_DOCS "${IREE_${CONFIG_NAME}_BUILD_DOCS}")
+  iree_to_bool(_CONFIG_BUILD_SAMPLES "${IREE_${CONFIG_NAME}_BUILD_SAMPLES}")
+  iree_to_bool(_CONFIG_BUILD_DEBUGGER "${IREE_${CONFIG_NAME}_BUILD_DEBUGGER}")
+  iree_to_bool(_CONFIG_BUILD_PYTHON_BINDINGS "${IREE_${CONFIG_NAME}_BUILD_PYTHON_BINDINGS}")
+  iree_to_bool(_CONFIG_BUILD_EXPERIMENTAL "${IREE_${CONFIG_NAME}_BUILD_EXPERIMENTAL}")
+
+  # Escape semicolons in the targets list so that CMake doesn't expand them to
+  # spaces.
+  string(REPLACE ";" "$<SEMICOLON>" _CONFIG_HAL_DRIVERS_TO_BUILD "${IREE_HAL_DRIVERS_TO_BUILD}")
+  string(REPLACE ";" "$<SEMICOLON>" _CONFIG_TARGET_BACKENDS_TO_BUILD "${IREE_TARGET_BACKENDS_TO_BUILD}")
+  # LINT.ThenChange(
+  #   https://github.com/google/iree/tree/master/CMakeLists.txt:iree_options,
+  #   https://github.com/google/iree/tree/master/build_tools/cmake/iree_cross_compile.cmake:iree_cross_compile_invoke
+  # )
+
+  message(STATUS "C compiler for ${CONFIG_NAME} build: ${_CONFIG_C_COMPILER}")
+  message(STATUS "C++ compiler for ${CONFIG_NAME} build: ${_CONFIG_CXX_COMPILER}")
+
+  add_custom_command(OUTPUT ${IREE_${CONFIG_NAME}_BINARY_ROOT}/CMakeCache.txt
+    COMMAND "${CMAKE_COMMAND}" "${PROJECT_SOURCE_DIR}" -G "${CMAKE_GENERATOR}"
+        -DCMAKE_MAKE_PROGRAM="${CMAKE_MAKE_PROGRAM}"
+        -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}"
+        -DCMAKE_C_COMPILER="${_CONFIG_C_COMPILER}"
+        -DCMAKE_CXX_COMPILER="${_CONFIG_CXX_COMPILER}"
+        # LINT.IfChange(iree_cross_compile_invoke)
+        -DIREE_ENABLE_RUNTIME_TRACING=${_CONFIG_ENABLE_RUNTIME_TRACING}
+        -DIREE_ENABLE_MLIR=${_CONFIG_ENABLE_MLIR}
+        -DIREE_ENABLE_EMITC=${_CONFIG_ENABLE_EMITC}
+        -DIREE_BUILD_COMPILER=${_CONFIG_BUILD_COMPILER}
+        -DIREE_BUILD_TESTS=${_CONFIG_BUILD_TESTS}
+        -DIREE_BUILD_DOCS=${_CONFIG_BUILD_DOCS}
+        -DIREE_BUILD_SAMPLES=${_CONFIG_BUILD_SAMPLES}
+        -DIREE_BUILD_DEBUGGER=${_CONFIG_BUILD_DEBUGGER}
+        -DIREE_BUILD_PYTHON_BINDINGS=${_CONFIG_BUILD_PYTHON_BINDINGS}
+        -DIREE_BUILD_EXPERIMENTAL=${_CONFIG_BUILD_EXPERIMENTAL}
+        # LINT.ThenChange(
+        #   https://github.com/google/iree/tree/master/CMakeLists.txt:iree_options,
+        #   https://github.com/google/iree/tree/master/build_tools/cmake/iree_cross_compile.cmake:iree_cross_compile_options,
+        # )
+        -DIREE_HAL_DRIVERS_TO_BUILD="${_CONFIG_HAL_DRIVERS_TO_BUILD}"
+        -DIREE_TARGET_BACKENDS_TO_BUILD="${_CONFIG_TARGET_BACKENDS_TO_BUILD}"
+    WORKING_DIRECTORY ${_CONFIG_BINARY_ROOT}
+    DEPENDS iree_prepare_${CONFIG_NAME}_dir
+    COMMENT "Configuring IREE for ${CONFIG_NAME} build...")
+
+  add_custom_target(iree_configure_${CONFIG_NAME} DEPENDS ${_CONFIG_BINARY_ROOT}/CMakeCache.txt)
+endfunction()
+
+# iree_get_build_command
+#
+# Gets the CMake build command for the given `TARGET`.
+#
+# Parameters:
+# TARGET: the target to build.
+# BINDIR: root binary directory containing CMakeCache.txt.
+# CMDVAR: variable name for receiving the build command.
+function(iree_get_build_command TARGET)
+  cmake_parse_arguments(_RULE "" "BINDIR;CMDVAR;CONFIG" "" ${ARGN})
+  if(NOT _RULE_CONFIG)
+    set(_RULE_CONFIG "$<CONFIG>")
+  endif()
+  if (CMAKE_GENERATOR MATCHES "Make")
+    # Use special command for Makefiles to support parallelism.
+    set(${_RULE_CMDVAR}
+        "$(MAKE)" "-C" "${_RULE_BINDIR}" "${TARGET}" PARENT_SCOPE)
+  else()
+    set(${_RULE_CMDVAR}
+        "${CMAKE_COMMAND}" --build ${_RULE_BINDIR}
+                           --target ${TARGET}
+                           --config ${_RULE_CONFIG} PARENT_SCOPE)
+  endif()
+endfunction()
+
+# iree_host_install
+#
+# Defines custom commands and targets for installing the given `target` under
+# host configuration. The custom target for install will be named as
+# `iree_host_install_${TARGET}`.
+#
+# Precondition:
+# iree_create_configuration(HOST) is invoked previously.
+#
+# Parameters:
+# COMPONENT: installation component; used for filtering installation targets.
+# PREFIX: the root installation path prefix.
+# DEPENDS: addtional dependencies for the installation.
+function(iree_host_install TARGET)
+  cmake_parse_arguments(_RULE "" "TARGET;COMPONENT;PREFIX" "DEPENDS" ${ARGN})
+  if(_RULE_COMPONENT)
+    set(_COMPONENT_OPTION -DCMAKE_INSTALL_COMPONENT="${_RULE_COMPONENT}")
+  endif()
+  if(_RULE_PREFIX)
+    set(_PREFIX_OPTION -DCMAKE_INSTALL_PREFIX="${_RULE_PREFIX}")
+  endif()
+
+  iree_get_executable_path(_OUTPUT_PATH ${TARGET})
+
+  add_custom_command(
+    OUTPUT ${_OUTPUT_PATH}
+    DEPENDS ${_RULE_DEPENDS}
+    COMMAND "${CMAKE_COMMAND}" ${_COMPONENT_OPTION} ${_PREFIX_OPTION}
+            -P "${IREE_HOST_BINARY_ROOT}/cmake_install.cmake"
+    USES_TERMINAL)
+
+  # Give it a custom target so we can drive the generation manually
+  # when useful.
+  add_custom_target(iree_host_install_${TARGET} DEPENDS ${_OUTPUT_PATH})
+endfunction()
+
+# iree_declare_host_excutable
+#
+# Generates custom commands and targets for building and installing a tool on
+# host for cross-compilation.
+#
+# Precondition:
+# iree_create_configuration(HOST) is invoked previously.
+#
+# Parameters:
+# TARGET: the target to build on host.
+# BUILDONLY: only generates commands for building the target.
+# DEPENDS: any additional dependencies for the target.
+function(iree_declare_host_excutable TARGET)
+  cmake_parse_arguments(_RULE "BUILDONLY" "" "DEPENDS" ${ARGN})
+
+  iree_get_executable_path(_OUTPUT_PATH ${TARGET})
+
+  iree_get_build_command(${TARGET}
+    BINDIR ${IREE_HOST_BINARY_ROOT}
+    CMDVAR build_cmd)
+
+  add_custom_target(iree_host_build_${TARGET}
+                    COMMAND ${build_cmd}
+                    DEPENDS iree_configure_HOST ${_RULE_DEPENDS}
+                    WORKING_DIRECTORY "${IREE_HOST_BINARY_ROOT}"
+                    COMMENT "Building host ${TARGET}..."
+                    USES_TERMINAL)
+
+  if(_RULE_BUILDONLY)
+    return()
+  endif()
+
+  iree_host_install(${TARGET}
+                    COMPONENT ${TARGET}
+                    PREFIX ${IREE_HOST_BINARY_ROOT}
+                    DEPENDS iree_host_build_${TARGET})
+
+  # Note that this is not enabled when BUILDONLY so we can define
+  # iree_host_${TARGET} to point to another installation path to
+  # allow flexibility.
+  add_custom_target(iree_host_${TARGET} DEPENDS "${_OUTPUT_PATH}")
+endfunction()

diff --git a/build_tools/cmake/iree_macros.cmake b/build_tools/cmake/iree_macros.cmake
index 4929146..7b27392 100644
--- a/build_tools/cmake/iree_macros.cmake
+++ b/build_tools/cmake/iree_macros.cmake

@@ -25,6 +25,22 @@
 endif()
 
 #-------------------------------------------------------------------------------
+# General utilities
+#-------------------------------------------------------------------------------
+
+# iree_to_bool
+#
+# Sets `variable` to `ON` if `value` is true and `OFF` otherwise.
+function(iree_to_bool VARIABLE VALUE)
+  if(VALUE)
+    set(${VARIABLE} "ON" PARENT_SCOPE)
+  else()
+    set(${VARIABLE} "OFF" PARENT_SCOPE)
+  endif()
+endfunction()
+
+
+#-------------------------------------------------------------------------------
 # Packages and Paths
 #-------------------------------------------------------------------------------
 
@@ -72,6 +88,28 @@
   set(${PACKAGE_DIR} ${_PACKAGE_DIR} PARENT_SCOPE)
 endfunction()
 
+# iree_get_executable_path
+#
+# Gets the path to an executable in a cross-compilation-aware way. This
+# should be used when accessing binaries that are used as part of the build,
+# such as for generating files used for later build steps.
+#
+# Paramters:
+# - OUTPUT_PATH_VAR: variable name for receiving the path to the built target.
+# - TARGET: the target to build on host.
+function(iree_get_executable_path OUTPUT_PATH_VAR TARGET)
+  if(CMAKE_CROSSCOMPILING)
+    # The target is defined in the CMake invocation for host. We don't have
+    # access to the target; relying on the path here.
+    set(_OUTPUT_PATH "${IREE_HOST_BINARY_ROOT}/bin/${TARGET}")
+    set(${OUTPUT_PATH_VAR} "${_OUTPUT_PATH}" PARENT_SCOPE)
+  else()
+    # The target is defined in this CMake invocation. We can query the location
+    # directly from CMake.
+    set(${OUTPUT_PATH_VAR} "$<TARGET_FILE:${TARGET}>" PARENT_SCOPE)
+  endif()
+endfunction()
+
 #-------------------------------------------------------------------------------
 # select()-like Evaluation
 #-------------------------------------------------------------------------------
@@ -169,3 +207,20 @@
     endif()
   endforeach()
 endfunction()
+
+# iree_add_executable_dependencies
+#
+# Adds dependency on a target in a cross-compilation-aware way. This should
+# be used for depending on targets that are used as part of the build, such
+# as for generating files used for later build steps.
+#
+# Parameters:
+# TARGET: the target to take on dependencies
+# DEPENDENCY: additional dependencies to append to target
+function(iree_add_executable_dependencies TARGET DEPENDENCY)
+  if(CMAKE_CROSSCOMPILING)
+    add_dependencies(${TARGET} iree_host_${DEPENDENCY})
+  else()
+    add_dependencies(${TARGET} ${DEPENDENCY})
+  endif()
+endfunction()

diff --git a/build_tools/embed_data/CMakeLists.txt b/build_tools/embed_data/CMakeLists.txt
index ec07934..4efad40 100644
--- a/build_tools/embed_data/CMakeLists.txt
+++ b/build_tools/embed_data/CMakeLists.txt

@@ -12,13 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-add_executable(generate_cc_embed_data)
-target_sources(generate_cc_embed_data PRIVATE generate_cc_embed_data.cc)
-set_target_properties(generate_cc_embed_data PROPERTIES OUTPUT_NAME generate_cc_embed_data)
+if(CMAKE_CROSSCOMPILING)
+  iree_declare_host_excutable(generate_cc_embed_data)
+else()
+  add_executable(generate_cc_embed_data)
+  target_sources(generate_cc_embed_data PRIVATE generate_cc_embed_data.cc)
+  set_target_properties(generate_cc_embed_data PROPERTIES OUTPUT_NAME generate_cc_embed_data)
 
-target_link_libraries(generate_cc_embed_data
-  absl::flags
-  absl::flags_parse
-  absl::strings
-  absl::time
-)
+  target_link_libraries(generate_cc_embed_data
+    absl::flags
+    absl::flags_parse
+    absl::strings
+    absl::time
+  )
+  install(TARGETS generate_cc_embed_data
+          COMPONENT generate_cc_embed_data
+          RUNTIME DESTINATION bin)
+endif()

diff --git a/docs/repository_management.md b/docs/repository_management.md
index 3cd2618..1d6a342 100644
--- a/docs/repository_management.md
+++ b/docs/repository_management.md

@@ -7,6 +7,13 @@
 transparency. If any of these things are particularly troublesome or painful for
 your workflow, please reach out to us so we can prioritize a fix.
 
+NOTE: We are currently in the process of migrating our repository to be
+GitHub-first and hide the merging complexity in a separate `google` feature
+branch so that standard development workflows don't have to bear the cost for
+every contribution. During this part of the migration period, please direct PRs
+to the `google` branch (which will be marked as the default branch). See
+https://groups.google.com/d/msg/iree-discuss/F07vsG9Ah4o/uAIusKO-BQAJ.
+
 ## Dependencies
 
 As a project which brings together compiler, runtime and graphics systems,
@@ -141,9 +148,6 @@
 this prior to running just to make sure that their git view of the submodule
 state is consistent.
 
-TODO(laurenzo): Add a GitHub hook to auto-commit submodule updates on
-`SUBMODULE_VERSIONS` file changes.
-
 #### Updating TensorFlow and LLVM versions
 
 WARNING: These scripts have not been updated to reflect the new tooling to

diff --git a/experimental/ModelBuilder/test/BUILD b/experimental/ModelBuilder/test/BUILD
index 39b6475..7f6d34d 100644
--- a/experimental/ModelBuilder/test/BUILD
+++ b/experimental/ModelBuilder/test/BUILD

@@ -165,6 +165,7 @@
         "//experimental/ModelBuilder:ModelRunner",
         "//experimental/ModelBuilder:VulkanLaunchWrapper",
         "//iree/base:initializer",
+        "//iree/compiler/Conversion/CodegenUtils",
         "//iree/compiler/Conversion/LinalgToSPIRV",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:AllPassesAndDialects",

diff --git a/experimental/ModelBuilder/test/TestVectorToGPU.cpp b/experimental/ModelBuilder/test/TestVectorToGPU.cpp
index 547e825..72902ef 100644
--- a/experimental/ModelBuilder/test/TestVectorToGPU.cpp
+++ b/experimental/ModelBuilder/test/TestVectorToGPU.cpp

@@ -47,7 +47,7 @@
 #include "mlir/Dialect/SPIRV/SPIRVOps.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/Passes.h"
-#include "iree/compiler/Conversion/LinalgToSPIRV/MarkerUtils.h"
+#include "iree/compiler/Conversion/CodegenUtils/MarkerUtils.h"
 
 using namespace mlir;                    // NOLINT
 using namespace mlir::edsc;              // NOLINT

diff --git a/integrations/tensorflow/bindings/python/pyiree/tf/compiler/BUILD b/integrations/tensorflow/bindings/python/pyiree/tf/compiler/BUILD
index d23a095..1f980ae 100644
--- a/integrations/tensorflow/bindings/python/pyiree/tf/compiler/BUILD
+++ b/integrations/tensorflow/bindings/python/pyiree/tf/compiler/BUILD

@@ -108,6 +108,7 @@
         "@llvm-project//mlir:IR",
         "@org_tensorflow//tensorflow/cc/saved_model:loader_lite",
         "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:convert_graphdef",
+        "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:tf_saved_model_passes",
         "@org_tensorflow//tensorflow/core:core_cpu",
     ],
 )

diff --git a/integrations/tensorflow/bindings/python/pyiree/tf/compiler/register_tensorflow.cc b/integrations/tensorflow/bindings/python/pyiree/tf/compiler/register_tensorflow.cc
index c9a4da4..1dcc0c2 100644
--- a/integrations/tensorflow/bindings/python/pyiree/tf/compiler/register_tensorflow.cc
+++ b/integrations/tensorflow/bindings/python/pyiree/tf/compiler/register_tensorflow.cc

@@ -63,7 +63,7 @@
                               absl::MakeSpan(mutable_exported_names));
   if (!module_or.status().ok()) {
     std::stringstream msg;
-    msg << "Failed to convert saved model to MLIR'" << saved_model_dir
+    msg << "Failed to convert saved model to MLIR '" << saved_model_dir
         << "': " << module_or.status();
     throw RaisePyError(PyExc_RuntimeError, msg.str().c_str());
   }
@@ -93,7 +93,7 @@
                                 context_bundle->mlir_context());
   if (!module_or.status().ok()) {
     std::stringstream msg;
-    msg << "Failed to convert saved model to MLIR'" << saved_model_dir
+    msg << "Failed to convert saved model to MLIR '" << saved_model_dir
         << "': " << module_or.status();
     throw RaisePyError(PyExc_RuntimeError, msg.str().c_str());
   }

diff --git a/integrations/tensorflow/bindings/python/pyiree/tf/support/BUILD b/integrations/tensorflow/bindings/python/pyiree/tf/support/BUILD
index a536513..c211cf0 100644
--- a/integrations/tensorflow/bindings/python/pyiree/tf/support/BUILD
+++ b/integrations/tensorflow/bindings/python/pyiree/tf/support/BUILD

@@ -16,6 +16,7 @@
     "//bindings/python:build_defs.oss.bzl",
     "INTREE_TENSORFLOW_PY_DEPS",
     "iree_py_library",
+    "iree_py_test",
 )
 
 package(
@@ -35,3 +36,15 @@
         "//bindings/python/pyiree/rt",
     ],
 )
+
+iree_py_test(
+    name = "tf_test_utils_test",
+    srcs = [
+        "tf_test_utils.py",
+        "tf_test_utils_test.py",
+    ],
+    python_version = "PY3",
+    deps = INTREE_TENSORFLOW_PY_DEPS + [
+        "//integrations/tensorflow/bindings/python/pyiree/tf/support",
+    ],
+)

diff --git a/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_test_utils.py b/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_test_utils.py
index 37da253..963ee4b 100644
--- a/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_test_utils.py
+++ b/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_test_utils.py

@@ -21,6 +21,7 @@
 
 import collections
 import os
+import random
 import re
 import tempfile
 
@@ -29,7 +30,6 @@
 import numpy as np
 from pyiree import rt
 from pyiree.tf import compiler
-import random
 import tensorflow.compat.v2 as tf
 
 flags.DEFINE_string(
@@ -83,11 +83,11 @@
         pass_pipeline=())
 
     # Save the input MLIR module.
-    flattened_target_backends = re.sub("[^0-9a-zA-Z]+", "_",
+    flattened_target_backends = re.sub("[^0-9a-zA-Z_]+", "_",
                                        "__".join(target_backends))
     if global_debug_dir:
       mlir_path = os.path.join(global_debug_dir,
-                               "raw_%s.mlir" % flattened_target_backends)
+                               "raw__%s.mlir" % flattened_target_backends)
       logging.info("Saving raw TF input MLIR to: %s", mlir_path)
       with open(mlir_path, "w") as f:
         f.write(compiler_module.to_asm())
@@ -97,7 +97,7 @@
 
     if global_debug_dir:
       mlir_path = os.path.join(global_debug_dir,
-                               "input_%s.mlir" % flattened_target_backends)
+                               "input__%s.mlir" % flattened_target_backends)
       logging.info("Saving IREE input MLIR to: %s", mlir_path)
       with open(mlir_path, "w") as f:
         f.write(compiler_module.to_asm())
@@ -105,7 +105,7 @@
     compiled_module = compiler_module.compile(target_backends=target_backends)
     if global_debug_dir:
       compiled_path = os.path.join(
-          global_debug_dir, "compiled_%s.vmfb" % flattened_target_backends)
+          global_debug_dir, "compiled__%s.vmfb" % flattened_target_backends)
       logging.info("Saving compiled IREE module to: %s", compiled_path)
       with open(compiled_path, "wb") as f:
         f.write(compiled_module)
@@ -315,6 +315,68 @@
     return _make_multi_result_class(results_tuple_class)(*all_results.values())
 
 
+def _recursive_check_same(result_ref, result_tgt, rtol=1e-6, atol=1e-6):
+  same = True
+  if not isinstance(result_tgt, type(result_ref)):
+    raise ValueError("Types of the outputs must be the same, but have '{}' and "
+                     "'{}'".format(type(result_ref), type(result_tgt)))
+  if isinstance(result_ref, dict):
+    if result_ref.keys() != result_tgt.keys():
+      raise ValueError("Outputs must have the same structure, but have '{}' and"
+                       " '{}'".format(result_ref.keys(), result_tgt.keys()))
+    for key in result_ref.keys():
+      same = same and _recursive_check_same(result_ref[key], result_tgt[key],
+                                            rtol, atol)
+      if not same:
+        return False  # no need to go further they are different
+  elif isinstance(result_ref, list):
+    if len(result_ref) != len(result_tgt):
+      raise ValueError("Outputs must have the same structure, but have '{}' and"
+                       " '{}'".format(result_ref, result_tgt))
+    for i in range(len(result_ref)):
+      same = same and _recursive_check_same(result_ref[i], result_tgt[i], rtol,
+                                            atol)
+      if not same:
+        return False  # no need to go further they are different
+  elif isinstance(result_ref, np.ndarray):
+    if isinstance(result_ref.flat[0], np.floating):
+      return np.allclose(result_ref, result_tgt, rtol=rtol, atol=atol)
+    else:
+      return np.array_equal(result_ref, result_tgt)
+  else:
+    # this one need more checks
+    return result_ref == result_tgt
+  return same
+
+
+def _collect_disagreements_recursively(mr, rtol=1e-6, atol=1e-6):
+  """Compare result structs recursively and search for disagreements.
+
+  Args:
+    mr: A MultiResults namedtuple where each entry corresponds to a backend set
+      of results.
+    rtol: The relative tolerance parameter.
+    atol: The absolute tolerance parameter.
+
+  Returns:
+    An equivalent MultiResults where each entry is an array of result names
+    that disagree.
+  """
+  has_disagreement = False
+  disagreement_list = [list() for _ in mr]
+  for i in range(len(mr)):
+    result_ref = mr[i]
+    for j in range(len(mr)):
+      if i < j:
+        continue  # Don't check self and reverse comparisons
+      result_tgt = mr[j]
+      if not _recursive_check_same(result_ref, result_tgt, rtol, atol):
+        has_disagreement = True
+        disagreement_list[i].append(mr._fields[j])
+  disagreements_tuple = collections.namedtuple("Disagreements", mr._fields)
+  return has_disagreement, disagreements_tuple(*disagreement_list)
+
+
 def _collect_disagreements(mr, predicate):
   """Verifies that result structs.
 
@@ -363,10 +425,31 @@
                                     (disagreements, self))
       return self
 
+    def assert_all_close_and_equal(self, rtol=1e-6, atol=1e-6):
+      # it is a special case when output can be a nestet map of dict(), list()
+      # with different types: int, float, string
+      # in this case int and string must be equal and for float we use rtol,atol
+      has_disagreement, disagreements = _collect_disagreements_recursively(
+          self, rtol, atol)
+      assert not has_disagreement, ("Multiple backends disagree (%r):\n%r" %
+                                    (disagreements, self))
+      return self
+
     def print(self):
       print(self)
       return self
 
+    def save(self):
+      if FLAGS.debug_dir:
+        for i in range(len(self)):
+          result = self[i]  # output generated by a model
+          field = self._fields[i]  # backend name
+          fname = os.path.join(FLAGS.debug_dir, "output_{}".format(field))
+          with open(fname, "w") as file:
+            # content of txt file can be converted to py objects by eval(txt)
+            file.write(str(result))
+      return self
+
   return MultiResults
 
 

diff --git a/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_test_utils_test.py b/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_test_utils_test.py
new file mode 100644
index 0000000..20ba522
--- /dev/null
+++ b/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_test_utils_test.py

@@ -0,0 +1,83 @@
+# Lint as: python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for pyiree.tf.support.tf_test_utils."""
+
+from absl.testing import parameterized
+import numpy as np
+from pyiree.tf.support import tf_test_utils
+import tensorflow as tf
+
+
+class UtilsTests(tf.test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters([
+      {
+          'testcase_name': 'all the same',
+          'array_c': np.array([0, 1, 2]),
+          'array_d': np.array(['0', '1', '2']),
+          'array_e': np.array([0.0, 0.1, 0.2]),
+          'tgt_same': True,
+      },
+      {
+          'testcase_name': 'wrong int',
+          'array_c': np.array([1, 1, 2]),
+          'array_d': np.array(['0', '1', '2']),
+          'array_e': np.array([0.0, 0.1, 0.2]),
+          'tgt_same': False,
+      },
+      {
+          'testcase_name': 'wrong string',
+          'array_c': np.array([0, 1, 2]),
+          'array_d': np.array(['a', '1', '2']),
+          'array_e': np.array([0.0, 0.1, 0.2]),
+          'tgt_same': False,
+      },
+      {
+          'testcase_name': 'wrong float',
+          'array_c': np.array([0, 1, 2]),
+          'array_d': np.array(['0', '1', '2']),
+          'array_e': np.array([1.0, 0.1, 0.2]),
+          'tgt_same': False,
+      },
+  ])
+  def test_recursive_check_same(self, array_c, array_d, array_e, tgt_same):
+
+    ref = {
+        'a':
+            1,
+        'b': [{
+            'c': np.array([0, 1, 2])
+        }, {
+            'd': np.array(['0', '1', '2'])
+        }, {
+            'e': np.array([0.0, 0.1, 0.2])
+        }],
+    }
+    tgt = {
+        'a': 1,
+        'b': [{
+            'c': array_c
+        }, {
+            'd': array_d
+        }, {
+            'e': array_e
+        }],
+    }
+    same = tf_test_utils._recursive_check_same(ref, tgt)
+    self.assertEqual(tgt_same, same)
+
+
+if __name__ == '__main__':
+  tf.test.main()

diff --git a/integrations/tensorflow/e2e/BUILD b/integrations/tensorflow/e2e/BUILD
index f25c6b1..1dbecf7 100644
--- a/integrations/tensorflow/e2e/BUILD
+++ b/integrations/tensorflow/e2e/BUILD

@@ -28,13 +28,29 @@
     licenses = ["notice"],  # Apache 2.0
 )
 
+# Create binaries for all test srcs to allow them to be run manually.
+[
+    py_binary(
+        name = src.replace(".py", "_manual"),
+        srcs = [src],
+        main = src,
+        python_version = "PY3",
+        deps = INTREE_TENSORFLOW_PY_DEPS + NUMPY_DEPS + [
+            "//integrations/tensorflow/bindings/python/pyiree/tf/support",
+        ],
+    )
+    for src in glob(["*_test.py"])
+]
+
 # Special cases to exclude from automatically expanding targets for all
 # backends.
+# keep sorted
 SPECIAL_CASES = [
     "explicit_backend_test.py",
     "linspace_test.py",
 ]
 
+# keep sorted
 VMLA_FAILING = [
     "fill_test.py",
     "mandelbrot_test.py",
@@ -42,6 +58,7 @@
     "strings_test.py",
 ]
 
+# keep sorted
 LLVM_FAILING = [
     "broadcasting_test.py",
     "depth_conv_test.py",
@@ -56,6 +73,7 @@
     "strings_test.py",
 ]
 
+# keep sorted
 VULKAN_FAILING = [
     "broadcasting_test.py",
     "depth_conv_test.py",
@@ -92,7 +110,7 @@
 )
 
 iree_e2e_test_suite(
-    name = "e2e",
+    name = "e2e_tests",
     backends_to_srcs = {
         "tf_also": TF_PASSING,
         "iree_vmla": VMLA_PASSING,
@@ -106,7 +124,7 @@
 )
 
 iree_e2e_test_suite(
-    name = "e2e_failing",
+    name = "e2e_tests_failing",
     backends_to_srcs = {
         "iree_vmla": VMLA_FAILING,
         "iree_llvmjit": LLVM_FAILING,
@@ -125,10 +143,11 @@
 
 # Special cases.
 
-# linspace_test passes internally, but fails in the OSS CI.
+# linspace_test passes internally, but fails in the OSS CI, so it needs
+# a "nokokoro" tag.
 iree_e2e_test_suite(
     # TODO(#2082): `linspace_test.py` fails in the `bazel-tensorflow` image.
-    name = "linspace",
+    name = "linspace_tests",
     backends_to_srcs = {
         "tf_also": ["linspace_test.py"],
         "iree_vmla": ["linspace_test.py"],
@@ -143,7 +162,7 @@
 )
 
 iree_e2e_test_suite(
-    name = "linspace_failing",
+    name = "linspace_tests_failing",
     backends_to_srcs = {
         "iree_llvmjit": ["linspace_test.py"],
         "iree_vulkan": ["linspace_test.py"],
@@ -159,7 +178,8 @@
     ],
 )
 
-# This tests explicitly writing which backends to use in Python.
+# This tests explicitly writing which backends to use in Python,
+# so overriding the backends can cause it to break.
 iree_py_test(
     name = "explicit_backend_test",
     srcs = ["explicit_backend_test.py"],

diff --git a/integrations/tensorflow/e2e/README.md b/integrations/tensorflow/e2e/README.md
index 2f3d331..36e5e71 100644
--- a/integrations/tensorflow/e2e/README.md
+++ b/integrations/tensorflow/e2e/README.md

@@ -22,38 +22,67 @@
 
 ## Running tests
 
-NOTE: We are in the process of reworking how backend specification functions, so
-you have to specify the target name including the name of the test suite and
-using a specific backend pair even if you are overriding the backends. The
-override backends take precedence.
-
 ```shell
 # For locally running tests and iterating on backend development,
 # `bazel run` is preferred.
-bazel run :e2e_math_test_tf_tf_also -- --override_backends=iree_vulkan
+bazel run :math_test_manual -- --override_backends=iree_vmla
 
 # Same as above, but add `tf` backend to cross-check numerical correctness.
-bazel run :e2e_math_test_tf_tf_also -- --override_backends=tf,iree_vulkan
+bazel run :math_test_manual -- --override_backends=tf,iree_vmla
 
 # Run all tests with defaults and output on failure.
 bazel test ... --test_output=errors
 
 # Run an individual test interactively.
-bazel test simple_arithmetic_test --test_output=streamed
-
-# Run tests with an altered list of backends.
-bazel test ... --test_output=errors \
-    --test_arg=--override_backends=tf,iree_vmla,iree_vulkan
+bazel run :math_test_manual -- --test_output=streamed
 ```
 
 If you specify the same backend multiple times, for example
---override_backends=iree_vmla,iree_vmla. The same backends are grouped and in
-this example iree_vmla will run once. If you specify tf,iree_vmla as backends,
-then we will test both backends and compare them with each other. If you specify
-tf backend only, then we will also test tf vs tf to capture any model
-initialization/randomization issues (it is a special case for debug purpose).
-For reproducibility of the unit tests we set random seed of tf and numpy by
-calling tf_test_utils.set_random_seed() before model creation.
+`--override_backends=iree_vmla,iree_vmla`. The same backends are grouped and in
+this example `iree_vmla` will run once. If you specify `tf,iree_vmla` as
+backends, then we will test both backends and compare them with each other. If
+you specify `tf` backend only, then we will also test `tf` vs `tf` to capture
+any model initialization/randomization issues (it is a special case for debug
+purpose). For reproducibility of the unit tests we set random seed of `tf` and
+`numpy` by calling `tf_test_utils.set_random_seed()` before model creation.
+
+## Test Suites
+
+Test targets are automatically generated for each test file and for each backend
+to check numerical correctness against TensorFlow. Tests targets that pass are
+placed into the `e2e_tests` test suite. Tests that fail on particular backends
+are recorded in lists in the `BUILD` files. For example, if
+`experimental_new_test.py` fails on the `iree_llvmjit` and `iree_vulkan`
+backends then the following lines should be added to the `BUILD` file:
+
+```build
+LLVM_FAILING = [
+    ...
+    "experimental_new_test.py",
+    ...
+]
+
+VULKAN_FAILING = [
+    ...
+    "experimental_new_test.py",
+    ...
+]
+```
+
+Test targets for these backends are placed into the `e2e_tests_failing` test
+suite. Test targets in these test suites can be run as follows:
+
+```shell
+# Run all e2e tests that are expected to pass.
+bazel test :e2e_tests
+
+# Run all e2e tests that are expected to fail.
+bazel test :e2e_tests_failing
+
+# Run a specific failing e2e test target.
+# Note that generated test targets are prefixed with their test suite name.
+bazel test :e2e_tests_failing_broadcasting_test__tf__iree_vulkan
+```
 
 ## Debugging tests
 
@@ -74,15 +103,7 @@
 ### Limiting a test to only certain backends
 
 The BUILD file specifies which targets work on which backends and controls which
-backends tests are run on by using the `--override_backends` flag. If you add a
-new test that does not work on some backends, list it as failing on those
-backends in the BUILD file.
-
-```build
-VULKAN_FAILING = [
-    "my_experimental_new_test.py",
-]
-```
+backends tests are run on by using the `--override_backends` flag.
 
 The `@tf_test_utils.compile_modules` decorator on tests also takes a `backends=`
 keyword argument. Many tests still specify this, but it is ignored in the CI,

diff --git a/integrations/tensorflow/e2e/iree_e2e_test_suite.bzl b/integrations/tensorflow/e2e/iree_e2e_test_suite.bzl
index e9efa4c..7b19938 100644
--- a/integrations/tensorflow/e2e/iree_e2e_test_suite.bzl
+++ b/integrations/tensorflow/e2e/iree_e2e_test_suite.bzl

@@ -48,7 +48,7 @@
 
     for backend, srcs in backends_to_srcs.items():
         for src in srcs:
-            test_name = "{}_{}_{}_{}".format(
+            test_name = "{}_{}__{}__{}".format(
                 name,
                 src[:-3],
                 reference_backend,

diff --git a/integrations/tensorflow/e2e/keras/BUILD b/integrations/tensorflow/e2e/keras/BUILD
index 6bff6e4..84d617b 100644
--- a/integrations/tensorflow/e2e/keras/BUILD
+++ b/integrations/tensorflow/e2e/keras/BUILD

@@ -16,7 +16,6 @@
     "//bindings/python:build_defs.oss.bzl",
     "INTREE_TENSORFLOW_PY_DEPS",
     "NUMPY_DEPS",
-    "iree_py_test",
 )
 load(
     "//integrations/tensorflow/e2e/keras:iree_vision_test_suite.bzl",
@@ -32,6 +31,47 @@
     licenses = ["notice"],  # Apache 2.0
 )
 
+# @unused
+DOC = """
+vision_model_test_manual is for manual testing of all keras vision models.
+Test will run only manually with all parameters specified manually, for example:
+bazel run -c opt integrations/tensorflow/e2e/keras:vision_model_test_manual -- \
+--override_backends=tf,iree_vmla,iree_llvmjit \
+--data=imagenet \
+--include_top=1 \
+--url=https://storage.googleapis.com/iree_models/ \
+--model=ResNet50
+
+Command arguments description:
+--override_backends: can be combination of these: tf,iree_vmla,iree_llvmjit
+--data: can be 'imagenet' or 'cifar10'.
+    imagenet - input image size (1, 224, 224, 3)
+    cifar10 - input image size (1, 32, 32, 3) - it is used for quick tests
+            and needs pretrained weights, we pretrained models: ResNet50, MobileNet, MobileNetV2
+--include_top: can be 1 or 0. Include top layer 1, not include top layer 0
+--url: we need it only for cifar10 models to load weights from https://storage.googleapis.com/iree_models/
+       imagenet pretrained weights url is specified by keras
+--model: supports ResNet50, MobileNet, MobileNetV2, ResNet101, ResNet152,
+    ResNet50V2, ResNet101V2, ResNet152V2, VGG16, VGG19, Xception,
+    InceptionV3, InceptionResNetV2, DenseNet121, DenseNet169,
+    DenseNet201, NASNetMobile, NASNetLarge
+    All above models works with 'imagenet' data sets.
+    ResNet50, MobileNet, MobileNetV2 work with both 'imagenet' and 'cifar10' data sets.
+"""
+
+[
+    py_binary(
+        name = src.replace(".py", "_manual"),
+        srcs = [src],
+        main = src,
+        python_version = "PY3",
+        deps = INTREE_TENSORFLOW_PY_DEPS + NUMPY_DEPS + [
+            "//integrations/tensorflow/bindings/python/pyiree/tf/support",
+        ],
+    )
+    for src in glob(["*_test.py"])
+]
+
 SPECIAL_CASES = [
     "vision_model_test.py",
 ]
@@ -71,7 +111,7 @@
 )
 
 iree_e2e_test_suite(
-    name = "keras",
+    name = "keras_tests",
     backends_to_srcs = {
         "tf_also": TF_PASSING,
         "iree_vmla": VMLA_PASSING,
@@ -85,7 +125,7 @@
 )
 
 iree_e2e_test_suite(
-    name = "keras_failing",
+    name = "keras_tests_failing",
     backends_to_srcs = {
         "iree_vmla": VMLA_FAILING,
         "iree_llvmjit": LLVM_FAILING,
@@ -102,52 +142,8 @@
     ],
 )
 
-# @unused
-DOC = """
-vision_models_test is for manual testing of all keras vision models.
-Test will run only manually with all parameters specified manually, for example:
-bazel run -c opt integrations/tensorflow/e2e/keras/vision_models_test -- \
---override_backends=tf,iree_vmla,iree_llvmjit \
---data=imagenet \
---include_top=1 \
---url=https://storage.googleapis.com/iree_models/ \
---model=ResNet50
-
-Command arguments description:
---override_backends: can be combination of these: tf,iree_vmla,iree_llvmjit
---data: can be 'imagenet' or 'cifar10'.
-    imagenet - input image size (1, 224, 224, 3)
-    cifar10 - input image size (1, 32, 32, 3) - it is used for quick tests
-            and needs pretrained weights, we pretrained models: ResNet50, MobileNet, MobileNetV2
---include_top: can be 1 or 0. Include top layer 1, not include top layer 0
---url: we need it only for cifar10 models to load weights from https://storage.googleapis.com/iree_models/
-       imagenet pretrained weights url is specified by keras
---model: supports ResNet50, MobileNet, MobileNetV2, ResNet101, ResNet152,
-    ResNet50V2, ResNet101V2, ResNet152V2, VGG16, VGG19, Xception,
-    InceptionV3, InceptionResNetV2, DenseNet121, DenseNet169,
-    DenseNet201, NASNetMobile, NASNetLarge
-    All above models works with 'imagenet' data sets.
-    ResNet50, MobileNet, MobileNetV2 work with both 'imagenet' and 'cifar10' data sets.
-"""
-
-iree_py_test(
-    name = "vision_models_test",
-    srcs = ["vision_model_test.py"],
-    main = "vision_model_test.py",
-    python_version = "PY3",
-    tags = [
-        "external",
-        "manual",
-        "nokokoro",
-        "notap",
-    ],
-    deps = INTREE_TENSORFLOW_PY_DEPS + NUMPY_DEPS + [
-        "//integrations/tensorflow/bindings/python/pyiree/tf/support",
-    ],
-)
-
 iree_vision_test_suite(
-    name = "vision_models",
+    name = "vision_internal_tests",
     datasets = ["cifar10"],
     models_to_backends = {
         "ResNet50": [
@@ -165,7 +161,7 @@
 )
 
 iree_vision_test_suite(
-    name = "vision_models_external",
+    name = "vision_external_tests",
     datasets = [
         "cifar10",
         "imagenet",
@@ -197,9 +193,8 @@
 )
 
 iree_vision_test_suite(
-    # TODO: Combine this suite with keras_vision_models_external once these
-    # tests pass.
-    name = "vision_models_external_failing",
+    # TODO: Combine this suite with vision_external_tests once these tests pass.
+    name = "vision_external_tests_failing",
     datasets = [
         "cifar10",
         "imagenet",

diff --git a/integrations/tensorflow/e2e/keras/iree_vision_test_suite.bzl b/integrations/tensorflow/e2e/keras/iree_vision_test_suite.bzl
index b2609fb..bd6aae2 100644
--- a/integrations/tensorflow/e2e/keras/iree_vision_test_suite.bzl
+++ b/integrations/tensorflow/e2e/keras/iree_vision_test_suite.bzl

@@ -67,11 +67,11 @@
             for backend in backends:
                 for dataset in datasets:
                     test_backends = [reference_backend, backend]
-                    test_name = "{}_{}_{}_{}".format(
+                    test_name = "{}_{}_{}__{}".format(
                         name,
                         model,
                         dataset,
-                        "_".join(test_backends),
+                        "__".join(test_backends),
                     )
                     tests.append(test_name)
 

diff --git a/integrations/tensorflow/e2e/keras/train/BUILD b/integrations/tensorflow/e2e/keras/train/BUILD
index 534ac3e..1160ab2 100644
--- a/integrations/tensorflow/e2e/keras/train/BUILD
+++ b/integrations/tensorflow/e2e/keras/train/BUILD

@@ -27,8 +27,9 @@
     licenses = ["notice"],  # Apache 2.0
 )
 
+# TODO(meadowlark): Refactor this rule to match iree_vision_test_suite.bzl
 iree_train_test_suite(
-    name = "train",
+    name = "train_tests",
     configurations = [
         # tuples of (optimizer, backends)
         ("sgd", "tf"),
@@ -45,7 +46,7 @@
 )
 
 iree_train_test_suite(
-    name = "train_failing",
+    name = "train_tests_failing",
     configurations = [
         # tuples of (optimizer, backends)
         # TODO: Combine this suite with keras_model_train once these tests pass.

diff --git a/integrations/tensorflow/e2e/keras/vision_model_test.py b/integrations/tensorflow/e2e/keras/vision_model_test.py
index 0a1e812..9164d9b 100644
--- a/integrations/tensorflow/e2e/keras/vision_model_test.py
+++ b/integrations/tensorflow/e2e/keras/vision_model_test.py

@@ -136,7 +136,7 @@
         np.float32)
     input_data = input_data.reshape(input_shape)
     self.modules.applications.all.predict(input_data).print().assert_all_close(
-        atol=3e-5)
+        atol=1e-6)
 
 
 if __name__ == '__main__':

diff --git a/iree/compiler/Conversion/CodegenUtils/BUILD b/iree/compiler/Conversion/CodegenUtils/BUILD
index 09daa26..ff2ef35 100644
--- a/iree/compiler/Conversion/CodegenUtils/BUILD
+++ b/iree/compiler/Conversion/CodegenUtils/BUILD

@@ -23,13 +23,16 @@
     name = "CodegenUtils",
     srcs = [
         "FunctionUtils.cpp",
+        "MarkerUtils.cpp",
     ],
     hdrs = [
         "FunctionUtils.h",
+        "MarkerUtils.h",
     ],
     deps = [
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:LinalgTransforms",
         "@llvm-project//mlir:Support",
     ],
 )

diff --git a/iree/compiler/Conversion/CodegenUtils/CMakeLists.txt b/iree/compiler/Conversion/CodegenUtils/CMakeLists.txt
index 8f00fc1..716b94b 100644
--- a/iree/compiler/Conversion/CodegenUtils/CMakeLists.txt
+++ b/iree/compiler/Conversion/CodegenUtils/CMakeLists.txt

@@ -19,11 +19,14 @@
     CodegenUtils
   HDRS
     "FunctionUtils.h"
+    "MarkerUtils.h"
   SRCS
     "FunctionUtils.cpp"
+    "MarkerUtils.cpp"
   DEPS
     LLVMSupport
     MLIRIR
+    MLIRLinalgTransforms
     MLIRSupport
   PUBLIC
 )

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/MarkerUtils.cpp b/iree/compiler/Conversion/CodegenUtils/MarkerUtils.cpp
similarity index 89%
rename from iree/compiler/Conversion/LinalgToSPIRV/MarkerUtils.cpp
rename to iree/compiler/Conversion/CodegenUtils/MarkerUtils.cpp
index 251a686..cf641e1 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/MarkerUtils.cpp
+++ b/iree/compiler/Conversion/CodegenUtils/MarkerUtils.cpp

@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "iree/compiler/Conversion/LinalgToSPIRV/MarkerUtils.h"
+#include "iree/compiler/Conversion/CodegenUtils/MarkerUtils.h"
 
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/IR/Attributes.h"
@@ -34,6 +34,8 @@
   return attr && (marker == "" || attr.getValue() == marker);
 }
 
+StringRef getNoTileMarker() { return "no-tile"; }
+
 StringRef getWorkGroupMarker() { return "workgroup"; }
 
 StringRef getWorkItemMarker() { return "workitem"; }
@@ -44,6 +46,10 @@
   return checkMarkerValue(op, marker);
 }
 
+bool hasNoTileMarker(Operation *op) {
+  return checkMarkerValue(op, getNoTileMarker());
+}
+
 bool hasWorkGroupMarker(Operation *op) {
   return checkMarkerValue(op, getWorkGroupMarker());
 }
@@ -63,6 +69,8 @@
               StringAttr::get(marker, op->getContext()));
 }
 
+void setNoTileMarker(Operation *op) { setMarker(op, getNoTileMarker()); }
+
 void setCooperativeMatrixMarker(Operation *op) {
   op->setAttr(VectorTransforms::kVectorTransformMarker,
               StringAttr::get(getCooperativeMatrixMarker(), op->getContext()));

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/MarkerUtils.h b/iree/compiler/Conversion/CodegenUtils/MarkerUtils.h
similarity index 70%
rename from iree/compiler/Conversion/LinalgToSPIRV/MarkerUtils.h
rename to iree/compiler/Conversion/CodegenUtils/MarkerUtils.h
index 633bca0..fa14263 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/MarkerUtils.h
+++ b/iree/compiler/Conversion/CodegenUtils/MarkerUtils.h

@@ -19,8 +19,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef IREE_COMPILER_CONVERSION_LINALGTOSPIRV_MARKERUTILS_H_
-#define IREE_COMPILER_CONVERSION_LINALGTOSPIRV_MARKERUTILS_H_
+#ifndef IREE_COMPILER_CONVERSION_CODEGENUTILS_MARKERUTILS_H_
+#define IREE_COMPILER_CONVERSION_CODEGENUTILS_MARKERUTILS_H_
 
 #include "llvm/ADT/StringRef.h"
 #include "mlir/Support/LLVM.h"
@@ -30,34 +30,55 @@
 class Operation;
 namespace iree_compiler {
 
+/// Marker to denote that do not tile the linalg operation.
+StringRef getNoTileMarker();
+
+/// Marker to denote that a linalg operation is to be partitioned to workgroups.
+StringRef getWorkGroupMarker();
+
 /// Marker to denote that a linalg operation is to be partitioned to workitems.
 StringRef getWorkItemMarker();
 
+/// Returns true if an operation has the specified `marker`. When `marker` is
+/// empty, returns true if the operation has any marker.
+bool hasMarker(Operation *, StringRef marker = "");
+
+/// Returns true if an operation has marker to denote that it is not to be
+/// tiled.
+bool hasNoTileMarker(Operation *);
+
+/// Returns true if an operation has marker to denote that it is to be
+/// partitioned to workgroups.
+bool hasWorkGroupMarker(Operation *);
+
+/// Returns true if an operation has marker to denote that it is to be
+/// partitioned to workitems.
+bool hasWorkItemMarker(Operation *);
+
 /// Returns true if an operation has a marker to denote that it will be mapped
 /// to cooperative matrix operations. Markers need to be consistent as
 /// cooperative matrices have their own type and load/store operations.
 bool hasCooperativeMatrixMarker(Operation *);
 
-/// Returns true if an operation has the specified `marker`. When `marker` is
-/// empty, returns true if the operation has any marker.
-bool hasMarker(Operation *, StringRef marker = "");
-
-/// Returns true if an operation has marker to denote that it is to be
-/// partitioned to workitems.
-bool hasWorkItemMarker(Operation *);
-
-/// Sets marker to denote that a vector operation is to be execute on a
-/// cooperative matrix.
-void setCooperativeMatrixMarker(Operation *);
-
 /// Sets a given marker on an operation.
 void setMarker(Operation *, StringRef);
 
+/// Sets marker to prevent tiling of a linalg operation.
+void setNoTileMarker(Operation *);
+
+/// Sets marker to denote that a linalg operation is to be partitioned to
+/// workgroups.
+void setWorkGroupMarker(Operation *);
+
 /// Sets marker to denote that a linalg operation is to be partitioned to
 /// workitems.
 void setWorkItemMarker(Operation *);
 
+/// Sets marker to denote that a vector operation is to be execute on a
+/// cooperative matrix.
+void setCooperativeMatrixMarker(Operation *);
+
 }  // namespace iree_compiler
 }  // namespace mlir
 
-#endif  // IREE_COMPILER_CONVERSION_LINALGTOSPIRV_MARKERUTILS_H_
+#endif  // IREE_COMPILER_CONVERSION_CODEGENUTILS_MARKERUTILS_H_

diff --git a/iree/compiler/Conversion/HLOToLinalg/HLOToLinalgOnBuffers.cpp b/iree/compiler/Conversion/HLOToLinalg/HLOToLinalgOnBuffers.cpp
index 01c0cba..6a54529 100644
--- a/iree/compiler/Conversion/HLOToLinalg/HLOToLinalgOnBuffers.cpp
+++ b/iree/compiler/Conversion/HLOToLinalg/HLOToLinalgOnBuffers.cpp

@@ -22,6 +22,7 @@
 
 #include <cstddef>
 
+#include "iree/compiler/Conversion/CodegenUtils/MarkerUtils.h"
 #include "iree/compiler/Conversion/HLOToLinalg/Passes.h"
 #include "iree/compiler/Dialect/HAL/IR/HALOps.h"
 #include "iree/compiler/Dialect/IREE/IR/IREEOps.h"
@@ -611,6 +612,8 @@
       cond ? rewriter.create<SelectOp>(loc, cond, inputVal, paddingVal)
            : inputVal;
   rewriter.create<linalg::YieldOp>(loc, result);
+
+  setNoTileMarker(linalgOp);
   return success();
 }
 

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/BUILD b/iree/compiler/Conversion/LinalgToSPIRV/BUILD
index 70df4d6..d2308b4 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/BUILD
+++ b/iree/compiler/Conversion/LinalgToSPIRV/BUILD

@@ -23,18 +23,14 @@
         "ConvertToGPUPass.cpp",
         "ConvertToSPIRVPass.cpp",
         "LinalgTileAndFusePass.cpp",
-        "MarkerUtils.cpp",
         "Passes.cpp",
         "SplitDispatchFunctionPass.cpp",
-        "Utils.cpp",
         "VectorToGPUPass.cpp",
     ],
     hdrs = [
         "Attributes.h",
-        "MarkerUtils.h",
         "MemorySpace.h",
         "Passes.h",
-        "Utils.h",
     ],
     deps = [
         "//iree/compiler/Conversion/CodegenUtils",

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/CMakeLists.txt b/iree/compiler/Conversion/LinalgToSPIRV/CMakeLists.txt
index b8821e2..ccc694c 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/CMakeLists.txt
+++ b/iree/compiler/Conversion/LinalgToSPIRV/CMakeLists.txt

@@ -19,18 +19,14 @@
     LinalgToSPIRV
   HDRS
     "Attributes.h"
-    "MarkerUtils.h"
     "MemorySpace.h"
     "Passes.h"
-    "Utils.h"
   SRCS
     "ConvertToGPUPass.cpp"
     "ConvertToSPIRVPass.cpp"
     "LinalgTileAndFusePass.cpp"
-    "MarkerUtils.cpp"
     "Passes.cpp"
     "SplitDispatchFunctionPass.cpp"
-    "Utils.cpp"
     "VectorToGPUPass.cpp"
   DEPS
     LLVMSupport

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/ConvertToGPUPass.cpp b/iree/compiler/Conversion/LinalgToSPIRV/ConvertToGPUPass.cpp
index 9a4c203..8a85b6a 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/ConvertToGPUPass.cpp
+++ b/iree/compiler/Conversion/LinalgToSPIRV/ConvertToGPUPass.cpp

@@ -17,9 +17,8 @@
 // Partition computation within dispatch function to workgroups/workitems.
 //
 //===----------------------------------------------------------------------===//
-#include "iree/compiler/Conversion/LinalgToSPIRV/MarkerUtils.h"
+#include "iree/compiler/Conversion/CodegenUtils/MarkerUtils.h"
 #include "iree/compiler/Conversion/LinalgToSPIRV/Passes.h"
-#include "iree/compiler/Conversion/LinalgToSPIRV/Utils.h"
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
@@ -41,7 +40,7 @@
 // Loop utilities
 //===----------------------------------------------------------------------===//
 
-/// Builds an empty scf.for operation. The default builder adds an entry basic
+/// Builds an empty loop.for operation. The default builder adds an entry basic
 /// block which needs to be avoided here.
 static scf::ForOp buildEmptyForOp(Location loc, OpBuilder &builder, Value lb,
                                   Value ub, Value step) {
@@ -51,15 +50,6 @@
   return cast<scf::ForOp>(builder.createOperation(state));
 }
 
-/// Builds an empty scf.if operation without the then and else blocks.
-static scf::IfOp buildEmptyIfOp(Location loc, OpBuilder &builder, Value cond) {
-  OperationState state(loc, scf::IfOp::getOperationName());
-  state.addOperands(cond);
-  state.addRegion();
-  state.addRegion();
-  return cast<scf::IfOp>(builder.createOperation(state));
-}
-
 namespace {
 struct LoopBounds {
   Value lb;
@@ -68,10 +58,10 @@
 };
 }  // namespace
 
-/// Replaces a scf.parallelOp with an optional scf.parallel op and nested
-/// scf.for operations. To create the scf.parallel op as the outermost loop,
+/// Replaces a loop.parallelOp with an optional loop.parallel op and nested
+/// loop.for operations. To create the loop.parallel op as the outermost loop,
 /// pass the lower bound, upper bound and steps in `newPLoopLbs`, `newPLoopUbs`,
-/// and `newPLoopStep` respectively. The bounds of the inner scf.for operations
+/// and `newPLoopStep` respectively. The bounds of the inner loop.for operations
 /// to be created are passed in `forLbs`, `forUbs`, and `forStep`. The
 /// `permutation` vector contains a mapping from the original loop order, to the
 /// loop order to be generated.
@@ -80,21 +70,21 @@
                                  ArrayRef<LoopBounds> newPLoopBounds,
                                  ArrayRef<LoopBounds> forBounds,
                                  ArrayRef<unsigned> permutation) {
-  assert(!forBounds.empty() && "unhandled case of no scf.for created");
+  assert(!forBounds.empty() && "unhandled case of no loop.for created");
   unsigned numLoops = pLoopOp.getNumLoops();
   Location loc = pLoopOp.getLoc();
   assert(forBounds.size() + newPLoopBounds.size() == numLoops &&
-         "cannot drop loops when splitting scf.parallel operation");
+         "cannot drop loops when splitting loop.parallel operation");
   assert(permutation.size() == numLoops);
   OpBuilder::InsertionGuard guard(rewriter);
 
-  // Need a signature conversion for the body of the scf.parallel operation,
+  // Need a signature conversion for the body of the loop.parallel operation,
   // before can it can be used as the body of the innermost loop created here.
   TypeConverter::SignatureConversion signatureConverter(numLoops);
   Operation *outermostLoop = nullptr;
   auto permuteIt = permutation.begin();
 
-  // Create the scf.parallel operation as the outermost loop, if specified.
+  // Create the loop.parallel operation as the outermost loop, if specified.
   if (!newPLoopBounds.empty()) {
     auto lbs = llvm::to_vector<2>(llvm::map_range(
         newPLoopBounds, [](LoopBounds bounds) -> Value { return bounds.lb; }));
@@ -111,7 +101,7 @@
     outermostLoop = newPLoop.getOperation();
   }
 
-  // Generate the nested scf.for operations with the bounds passed.
+  // Generate the nested loop.for operations with the bounds passed.
   for (auto it : enumerate(forBounds)) {
     Value lb = it.value().lb, ub = it.value().ub, step = it.value().step;
     if (it.index() != forBounds.size() - 1) {
@@ -120,7 +110,7 @@
       signatureConverter.remapInput(*permuteIt, forOp.getInductionVar());
       rewriter.setInsertionPointToStart(forOp.getBody());
     } else {
-      // For the last loop, move the body of the scf.parallel op as the body of
+      // For the last loop, move the body of the loop.parallel op as the body of
       // the loop after signature conversion.
       auto forOp = buildEmptyForOp(loc, rewriter, lb, ub, step);
       if (!outermostLoop) outermostLoop = forOp.getOperation();
@@ -137,8 +127,8 @@
   return outermostLoop;
 }
 
-/// Serializes the dimensions of the scf.parallel specified in
-/// `serializedDimensions`, by creating an nested scf.for operation for each
+/// Serializes the dimensions of the loop.parallel specified in
+/// `serializedDimensions`, by creating an nested loop.for operation for each
 /// dimension.
 // TODO(ravishankarm): Move this into LoopUtils.h in MLIR.
 static Operation *serializeDimensions(ConversionPatternRewriter &rewriter,
@@ -151,7 +141,7 @@
   serializedDimSet.insert(serializedDimensions.begin(),
                           serializedDimensions.end());
   assert(serializedDimSet.size() == serializedDimensions.size() &&
-         "cannot repeat dimensions during serialization of scf.parallel");
+         "cannot repeat dimensions during serialization of loop.parallel");
   SmallVector<LoopBounds, 2> newPLoopBounds, forBounds;
   SmallVector<unsigned, 2> permutation;
   auto lbs = pLoopOp.lowerBound();
@@ -184,85 +174,16 @@
   return serializeDimensions(rewriter, pLoopOp, serializedDimensions);
 }
 
-/// Collapses all loops in a scf.parallel into one scf.parallel operation. This
-/// is done by
-/// 1) Normalize the loop bounds to be [0, (ub - lb) / step)
-/// 2) Compute the total number of iterations.
-/// 3) From the induction variable of the modified loop, compute the values of
-///    the original induction variables by de-linearization.
-scf::ParallelOp collapseParallelLoops(ConversionPatternRewriter &rewriter,
-                                      scf::ParallelOp pLoopOp) {
-  if (pLoopOp.getNumReductions()) return nullptr;
-
-  unsigned numLoops = pLoopOp.getNumLoops();
-  if (numLoops == 1) return pLoopOp;
-
-  // Compute the number of iterations of each loops starting from the innermost.
-  Location loc = pLoopOp.getLoc();
-  Value totalNumIterations = rewriter.create<ConstantIndexOp>(loc, 1);
-
-  // Track the "stride" of each loop, i.e. product of the total number of
-  // iterations of the inner loops.
-  SmallVector<Value, 2> iterationStride;
-  iterationStride.resize(pLoopOp.getNumLoops());
-  auto lbs = pLoopOp.lowerBound();
-  auto ubs = pLoopOp.upperBound();
-  auto steps = pLoopOp.step();
-  for (int i = numLoops - 1; i >= 0; --i) {
-    Value lb = lbs[i], ub = ubs[i], step = steps[i];
-    Value iterCount = rewriter.create<SignedDivIOp>(
-        loc, rewriter.create<SubIOp>(loc, ub, lb), step);
-    iterationStride[i] = totalNumIterations;
-    totalNumIterations =
-        rewriter.create<MulIOp>(loc, totalNumIterations, iterCount);
-  }
-
-  // Create the collapsed parallel loop op with lowerbound 0, step 1 and upper
-  // bound being the totalNumIterations.
-  Value newLb = rewriter.create<ConstantIndexOp>(loc, 0);
-  Value newStep = rewriter.create<ConstantIndexOp>(loc, 1);
-  scf::ParallelOp newPLoopOp =
-      rewriter.create<scf::ParallelOp>(loc, newLb, totalNumIterations, newStep);
-
-  // Build the body of the collapsed loop by cloning the original loop body. The
-  // replacement value of the induction variables of the original loop body,
-  // from the induction variable of the new loop, using
-  //   origLoopIv[i] = loopIv / iterationStride[i]
-  //   loopIv = loopIv % iterationStride[i]
-  OpBuilder::InsertionGuard guard(rewriter);
-  Block &pLoopBody = pLoopOp.getLoopBody().front();
-  rewriter.setInsertionPointToStart(&newPLoopOp.getLoopBody().front());
-  Value loopIv = *newPLoopOp.getInductionVars().begin();
-  BlockAndValueMapping map;
-  for (int i : llvm::seq<int>(0, numLoops)) {
-    Value iterNum =
-        rewriter.create<SignedDivIOp>(loc, loopIv, iterationStride[i]);
-    Value newIv = rewriter.create<AddIOp>(
-        loc, lbs[i], rewriter.create<MulIOp>(loc, iterNum, steps[i]));
-    map.map(pLoopBody.getArgument(i), newIv);
-    loopIv = rewriter.create<SignedRemIOp>(loc, loopIv, iterationStride[i]);
-  }
-  for (Operation &op : pLoopBody.without_terminator()) {
-    rewriter.clone(op, map);
-  }
-  rewriter.eraseOp(pLoopOp);
-  return newPLoopOp;
-}
-
 //===----------------------------------------------------------------------===//
 // GPU processor ID mapping utilities
 //===----------------------------------------------------------------------===//
 
-/// Distributes scf.parallel to processors with the processors logically
+/// Distribute loop.parallel to processors with the processors logically
 /// arranged with same dimensionality as the number of loops, i.e. a
-/// scf.parallel with 2 loops to a 2D grid of processors. `processorIDs` and
+/// loop.parallel with 2 loops to a 2D grid of processors. `processorIDs` and
 /// `numProcessors` must be of same size as the number of loops and are the
 /// values to use for process ID and number of processors along each dimension
 /// in the distributed code.
-/// This method accounts for the case where the number of processors is not
-/// enough to execute the entire iteration space with one iteration mapped to
-/// each processor. So implements a block-cyclic distribution with each block
-/// size being equal to the number of processors.
 static LogicalResult mapToProcessors(ConversionPatternRewriter &rewriter,
                                      scf::ParallelOp pLoopOp,
                                      ArrayRef<Value> processorIDs,
@@ -291,39 +212,6 @@
   return success();
 }
 
-/// Distributes scf.parallel to processors with the processors logically
-/// arranged with same dimensionality as the number of loops, i.e. a
-/// scf.parallel with 2 loops to a 2D grid of processors. `processorIDs` must be
-/// of same size as the number of loops and are the values to use for process ID
-/// and number of processors along each dimension in the distributed code.  This
-/// method assumes that the number of processors is greater than or equal to the
-/// number of iterations. So just generates an if statement to mask of
-/// processors with no work.
-static LogicalResult mapToProcessorsAndGuard(
-    ConversionPatternRewriter &rewriter, scf::ParallelOp pLoopOp,
-    ArrayRef<Value> processorIDs) {
-  unsigned numLoops = pLoopOp.getNumLoops();
-  Location loc = pLoopOp.getLoc();
-  assert(numLoops == processorIDs.size() &&
-         "expected as many ids as number of loops");
-  Value cond = nullptr;
-  TypeConverter::SignatureConversion signatureConverter(numLoops);
-  auto ubs = pLoopOp.upperBound();
-  for (unsigned i : llvm::seq<unsigned>(0, numLoops)) {
-    Value cmp = rewriter.create<CmpIOp>(loc, CmpIPredicate::slt,
-                                        processorIDs[i], ubs[i]);
-    cond = (cond ? rewriter.create<AndOp>(loc, cond, cmp) : cmp);
-    signatureConverter.remapInput(i, processorIDs[i]);
-  }
-  scf::IfOp ifOp = buildEmptyIfOp(loc, rewriter, cond);
-  Region &pLoopOpRegion = pLoopOp.getLoopBody();
-  rewriter.applySignatureConversion(&pLoopOpRegion, signatureConverter);
-  Region &ifOpRegion = ifOp.getRegion(0);
-  rewriter.inlineRegionBefore(pLoopOpRegion, ifOpRegion, ifOpRegion.begin());
-  rewriter.eraseOp(pLoopOp);
-  return success();
-}
-
 namespace {
 struct ProcessorIdAndCount {
   Value id;
@@ -363,24 +251,7 @@
           rewriter.create<MulIOp>(loc, blockDim, gridDim)};
 }
 
-template <typename GPUIdOp, typename GPUCountOp>
-static void getGPUProcessorIdsAndCounts(Location loc,
-                                        ConversionPatternRewriter &rewriter,
-                                        unsigned numDims,
-                                        MutableArrayRef<Value> id,
-                                        MutableArrayRef<Value> count) {
-  ArrayRef<StringRef> dims = {"x", "y", "z"};
-  assert(id.size() == numDims);
-  assert(count.size() == numDims);
-  for (unsigned i = 0; i < numDims; ++i) {
-    ProcessorIdAndCount idAndCount =
-        getGPUProcessorIdAndCount<GPUIdOp, GPUCountOp>(loc, dims[i], rewriter);
-    id[numDims - 1 - i] = idAndCount.id;
-    count[numDims - 1 - i] = idAndCount.count;
-  }
-}
-
-/// Distributes scf.parallel to processors where `IdOp` is used to get the
+/// Distribute loop.parallel to processors where `IdOp` is used to get the
 /// processor ID and `DimOp` is used to get the number of processors along a
 /// dimension.
 template <typename GPUIdOp, typename GPUCountOp>
@@ -392,51 +263,38 @@
         cast<scf::ParallelOp>(serializeDimensionsFrom(rewriter, pLoopOp, 3));
     numLoops = 3;
   }
-  SmallVector<Value, 2> id(numLoops), count(numLoops);
-  getGPUProcessorIdsAndCounts<GPUIdOp, GPUCountOp>(pLoopOp.getLoc(), rewriter,
-                                                   numLoops, id, count);
+  SmallVector<Value, 2> id, count;
+  id.reserve(numLoops);
+  count.reserve(numLoops);
+  ArrayRef<StringRef> dims = {"x", "y", "z"};
+  Location loc = pLoopOp.getLoc();
+  for (unsigned i = 0; i < numLoops; ++i) {
+    ProcessorIdAndCount idAndCount =
+        getGPUProcessorIdAndCount<GPUIdOp, GPUCountOp>(loc, dims[i], rewriter);
+    id.insert(id.begin(), idAndCount.id);
+    count.insert(count.begin(), idAndCount.count);
+  }
   return mapToProcessors(rewriter, pLoopOp, id, count);
 }
 
-/// Distributes scf.parallel to processors where `IdOp` is used to get the
-/// processor ID and `DimOp` is used to get the number of processors along a
-/// dimension. Assumes that the number of processors will be less than equal to
-/// the number of iterations of the pLoopOp along all dimensions.
-template <typename GPUIdOp, typename GPUCountOp>
-static LogicalResult mapToProcessorsAndGuard(
-    ConversionPatternRewriter &rewriter, scf::ParallelOp pLoopOp) {
-  unsigned numLoops = pLoopOp.getNumLoops();
-  if (numLoops > 3) {
-    pLoopOp =
-        cast<scf::ParallelOp>(serializeDimensionsFrom(rewriter, pLoopOp, 3));
-    numLoops = 3;
-  }
-  SmallVector<Value, 2> id(numLoops), count(numLoops);
-  getGPUProcessorIdsAndCounts<GPUIdOp, GPUCountOp>(pLoopOp.getLoc(), rewriter,
-                                                   numLoops, id, count);
-  return mapToProcessorsAndGuard(rewriter, pLoopOp, id);
-}
-
-/// Distribute the scf.parallel to workgroups.
+/// Distribute the loop.parallel to workgroups.
 static LogicalResult mapToWorkgroups(ConversionPatternRewriter &rewriter,
                                      scf::ParallelOp pLoopOp) {
   return mapToProcessor<gpu::BlockIdOp, gpu::GridDimOp>(rewriter, pLoopOp);
 }
 
-/// Distributes scf.parallel to workitems using local invocation ID.
+/// Distribute loop.parallel to workitems using local invocation ID.
 static LogicalResult mapToLocalInvocationId(ConversionPatternRewriter &rewriter,
                                             scf::ParallelOp pLoopOp) {
-  return mapToProcessorsAndGuard<gpu::ThreadIdOp, gpu::BlockDimOp>(rewriter,
-                                                                   pLoopOp);
+  return mapToProcessor<gpu::ThreadIdOp, gpu::BlockDimOp>(rewriter, pLoopOp);
 }
 
-/// Distributes scf.parallel to workitems using global invocation ID. The GPU
+/// Distribute loop.parallel to workitems using global invocation ID. The GPU
 /// dialect doesn't have a direct operation to do this. This could be done using
 /// id = blockIdx * blockDim + gridIdx. count = blockDim * gridDim.
 static LogicalResult mapToGlobalInvocationId(
     ConversionPatternRewriter &rewriter, scf::ParallelOp pLoopOp) {
-  return mapToProcessorsAndGuard<GPUGlobalId, GPUGlobalCount>(rewriter,
-                                                              pLoopOp);
+  return mapToProcessor<GPUGlobalId, GPUGlobalCount>(rewriter, pLoopOp);
 }
 
 //===----------------------------------------------------------------------===//
@@ -449,7 +307,7 @@
   void runOnFunction() override;
 };
 
-/// Pattern to map scf.parallel to workgroups.
+/// Pattern to map loop.parallel to workgroups.
 struct PartitionPLoopToWorkgroups
     : public OpConversionPattern<scf::ParallelOp> {
   using OpConversionPattern<scf::ParallelOp>::OpConversionPattern;
@@ -460,7 +318,7 @@
   }
 };
 
-/// Map tiled linalg op to workitems by lowering it to scf.parallel and
+/// Map tiled linalg op to workitems by lowering it to loop.parallel and
 /// partitioning it to workitems.
 template <typename LinalgOpTy>
 struct MapLinalgOpToLocalInvocationId : public OpConversionPattern<LinalgOpTy> {
@@ -493,29 +351,19 @@
   LogicalResult matchAndRewrite(
       LinalgOpTy linalgOp, ArrayRef<Value> operands,
       ConversionPatternRewriter &rewriter) const override {
-    // If marker exists do nothing.
-    if (hasMarker(linalgOp)) return failure();
+    // If marker exists and its not no-tile, do nothing.
+    if (hasMarker(linalgOp) && !hasNoTileMarker(linalgOp)) return failure();
     Optional<linalg::LinalgLoops> loops =
         linalg::linalgLowerOpToLoops<scf::ParallelOp>(rewriter, linalgOp);
     if (!loops) return failure();
-
-    SmallVector<int64_t, 3> workgroupSize(3, 1);
     if (!loops.getValue().empty()) {
       scf::ParallelOp pLoopOp = dyn_cast<scf::ParallelOp>(loops.getValue()[0]);
       // If there are parallel loops partition them to threads using global
       // invocation ID.
-      if (pLoopOp) {
-        pLoopOp = collapseParallelLoops(rewriter, pLoopOp);
-        if (!pLoopOp) return failure();
-        if (failed(mapToGlobalInvocationId(rewriter, pLoopOp)))
-          return rewriter.notifyMatchFailure(
-              linalgOp, "mapping to GlobalInvocationID failed");
-        workgroupSize = {32, 1, 1};
-      }
+      if (pLoopOp && failed(mapToGlobalInvocationId(rewriter, pLoopOp)))
+        return failure();
     }
     rewriter.eraseOp(linalgOp);
-    FuncOp funcOp = linalgOp.template getParentOfType<FuncOp>();
-    if (funcOp) updateWorkGroupSize(funcOp, workgroupSize);
     return success();
   }
 };
@@ -544,7 +392,7 @@
 
   MLIRContext *context = &getContext();
   ConversionTarget target(*context);
-  // After this pass Linalg and scf.parallel ops should be gone.
+  // After this pass Linalg and loop.parallel ops should be gone.
   target.addIllegalOp<scf::ParallelOp>();
   target.addIllegalDialect<linalg::LinalgDialect>();
   // Reshape ops are treated legal since they just change the way the underlying

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/ConvertToSPIRVPass.cpp b/iree/compiler/Conversion/LinalgToSPIRV/ConvertToSPIRVPass.cpp
index 399b3f5..aeb0996 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/ConvertToSPIRVPass.cpp
+++ b/iree/compiler/Conversion/LinalgToSPIRV/ConvertToSPIRVPass.cpp

@@ -21,7 +21,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "iree/compiler/Conversion/LinalgToSPIRV/MarkerUtils.h"
+#include "iree/compiler/Conversion/CodegenUtils/MarkerUtils.h"
 #include "iree/compiler/Dialect/HAL/IR/HALOps.h"
 #include "iree/compiler/Dialect/IREE/IR/IREEOps.h"
 #include "llvm/ADT/STLExtras.h"

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/LinalgTileAndFusePass.cpp b/iree/compiler/Conversion/LinalgToSPIRV/LinalgTileAndFusePass.cpp
index 7125f8b..48e1d57 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/LinalgTileAndFusePass.cpp
+++ b/iree/compiler/Conversion/LinalgToSPIRV/LinalgTileAndFusePass.cpp

@@ -17,10 +17,9 @@
 // Implements a pass to tile and fuse linalg operations on buffers.
 //
 //===----------------------------------------------------------------------===//
-#include "iree/compiler/Conversion/LinalgToSPIRV/MarkerUtils.h"
+#include "iree/compiler/Conversion/CodegenUtils/MarkerUtils.h"
 #include "iree/compiler/Conversion/LinalgToSPIRV/MemorySpace.h"
 #include "iree/compiler/Conversion/LinalgToSPIRV/Passes.h"
-#include "iree/compiler/Conversion/LinalgToSPIRV/Utils.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/Linalg/Utils/Utils.h"
@@ -72,6 +71,47 @@
       .size();
 }
 
+/// Updates the workgroup size used for the dispatch region.
+static LogicalResult updateWorkGroupSize(FuncOp funcOp,
+                                         ArrayRef<int64_t> workGroupSize) {
+  // Need to update both the surrounding FuncOp that has the spv.entry_point_abi
+  // attribute, and the hal.executable.
+  Region &body = funcOp.getBody();
+  if (!llvm::hasSingleElement(body))
+    return funcOp.emitError("unhandled dispatch function with multiple blocks");
+
+  SmallVector<int32_t, 3> workGroupSizeVec = llvm::to_vector<3>(llvm::map_range(
+      workGroupSize, [](int64_t v) { return static_cast<int32_t>(v); }));
+
+  // TODO(ravishankarm, antiagainst): We should have at most one scf.parallel
+  // op, but that is not the case till the splitting of kernels lands.
+  unsigned numParallelLoops = 0;
+  auto updateNumParallelLoops = [&numParallelLoops](unsigned nPar) {
+    numParallelLoops =
+        (!numParallelLoops ? nPar : std::min(numParallelLoops, nPar));
+  };
+  for (auto parallelLoop : body.front().getOps<scf::ParallelOp>()) {
+    updateNumParallelLoops(parallelLoop.getNumLoops());
+  }
+  // If there are no parallel loops, there might be linalg ops that arent
+  // tiled. Use that to get the number of parallel loops.
+  for (auto linalgOp : body.front().getOps<linalg::LinalgOp>()) {
+    updateNumParallelLoops(getNumOuterParallelLoops(linalgOp));
+  }
+  workGroupSizeVec.resize(numParallelLoops);
+  LLVM_DEBUG({
+    llvm::dbgs() << "--- IREE Linalg tile and fuse configuration ---\n";
+    llvm::dbgs() << "# workgroup sizes at end: [";
+    interleaveComma(workGroupSizeVec, llvm::dbgs());
+    llvm::dbgs() << "]\n";
+  });
+  MLIRContext *context = funcOp.getContext();
+  workGroupSizeVec.resize(3, 1);
+  funcOp.setAttr(spirv::getEntryPointABIAttrName(),
+                 spirv::getEntryPointABIAttr(workGroupSizeVec, context));
+  return success();
+}
+
 namespace {
 
 /// Computes tile sizes (and workgroup size) to use based on operations within
@@ -79,13 +119,7 @@
 class TileSizeCalculator {
  public:
   TileSizeCalculator(FuncOp funcOp)
-      : resourceLimits(spirv::lookupTargetEnv(funcOp).getResourceLimits()) {
-    if (DenseIntElementsAttr attr = spirv::lookupLocalWorkGroupSize(funcOp)) {
-      for (auto val : attr.getValues<APInt>())
-        workgroupSize.push_back(val.getSExtValue());
-    }
-    workgroupSize.resize(3, 1);
-  }
+      : resourceLimits(spirv::lookupTargetEnv(funcOp).getResourceLimits()) {}
 
   /// Compute the tile sizes based on workgroup size specified.
   LogicalResult setTileSizesBasedOnWorkgroupSize(
@@ -105,10 +139,21 @@
   /// Get the current tile size computed.
   ArrayRef<int64_t> getTileSizes() const { return tileSizes; }
 
+  /// Linalg convention is to use 0 for no tiling. If any of the tile dimensions
+  /// is set to 1 make it 0.
+  SmallVector<int64_t, 3> getTileSizesForLinalg() const {
+    return llvm::to_vector<3>(llvm::map_range(
+        tileSizes, [](int64_t v) -> int64_t { return v == 1 ? 0 : v; }));
+  }
+
   /// Returns the workgroup size to use based on the tile sizes.
   ArrayRef<int64_t> getWorkGroupSize() const { return workgroupSize; }
 
  private:
+  /// Get the default tile sizes based on just number of dimensions, i.e., "x",
+  /// "y", and "z".
+  void setTileSizesBasedOnDimensions(unsigned numDims);
+
   /// Current tile size configuration.
   SmallVector<int64_t, 4> tileSizes;
 
@@ -120,72 +165,67 @@
 };
 }  // namespace
 
+void TileSizeCalculator::setTileSizesBasedOnDimensions(unsigned numDims) {
+  tileSizes.clear();
+  workgroupSize.clear();
+  tileSizes.reserve(3);
+  if (numDims == 0) {
+    // Scalar case.
+    workgroupSize = {1, 1, 1};
+    return;
+  }
+  unsigned maxWorkGroupSize =
+      resourceLimits.max_compute_workgroup_invocations().getInt();
+
+  // Make the tile size 32 along the x-dimension, and then split the remaining
+  // maxWorkGroupSize threads amongst the y-dimension or z-dimension.
+  unsigned tileSizeX = llvm::PowerOf2Floor(std::min(maxWorkGroupSize, 32u));
+  maxWorkGroupSize /= tileSizeX;
+  if (numDims == 1) {
+    tileSizes = {tileSizeX};
+    workgroupSize = {tileSizeX, 1, 1};
+    return;
+  }
+  if (numDims == 2) {
+    unsigned tileSizeY = llvm::PowerOf2Floor(maxWorkGroupSize);
+    tileSizes = {tileSizeY, tileSizeX};
+    workgroupSize = {tileSizeX, tileSizeY, 1};
+    return;
+  }
+  unsigned tileSizeYZ =
+      llvm::PowerOf2Floor(static_cast<unsigned>(std::sqrt(maxWorkGroupSize)));
+  tileSizes = {tileSizeYZ, tileSizeYZ, tileSizeX};
+  workgroupSize = {tileSizeX, tileSizeYZ, tileSizeYZ};
+}
+
 LogicalResult TileSizeCalculator::setTileSizesBasedOnOps(
     ArrayRef<linalg::LinalgOp> linalgOps) {
   tileSizes.clear();
-  if (linalgOps.empty()) {
-    tileSizes = {1, 1, 1};
-    workgroupSize = {1, 1, 1};
-    return success();
-  }
   // The tile size will be driven by operations like matmul, conv, etc. within
   // the list. So see what operation exists in the list to decide the tile size.
   // If there are two such operations in the list, return error.
-  enum OpInfo : uint32_t {
-    None = 0x0,
-    Convolution = 0x1,
-    Matmul = 0x2,
-    Pooling = 0x4,
-  };
-  uint32_t opInfo = OpInfo::None;
-  for (linalg::LinalgOp linalgOp : linalgOps) {
-    Operation *op = linalgOp.getOperation();
-    if (isa<linalg::ConvOp>(op)) opInfo |= OpInfo::Convolution;
-    if (isa<linalg::MatmulOp>(op)) opInfo |= OpInfo::Matmul;
-    if (isa<linalg::PoolingMaxOp>(op)) opInfo |= OpInfo::Pooling;
-    if (isa<linalg::PoolingMinOp>(op)) opInfo |= OpInfo::Pooling;
-    if (isa<linalg::PoolingSumOp>(op)) opInfo |= OpInfo::Pooling;
+  bool hasMatmul = false;
+  unsigned numParallelLoops = kMaxWorkgroupRank;
+  for (linalg::LinalgOp op : linalgOps) {
+    // If there is no marker on this op (i.e. a marker to prevent tile), add an
+    // explicit marker to indicate that the op is to be tiled. Makes subsequent
+    // lowering simpler.
+    if (isa<linalg::MatmulOp>(op.getOperation())) {
+      if (hasMatmul)
+        return op.emitError(
+            "unhandled multiple matmuls within dispatch region");
+      hasMatmul = true;
+    }
+    numParallelLoops = std::min(numParallelLoops, getNumOuterParallelLoops(op));
   }
-  // If there are no tilable ops, there is nothing to do here.
-  if (!opInfo) return success();
-
-  Operation *linalgOp = *(linalgOps.begin());
-  if (llvm::countPopulation(opInfo) != 1)
-    return linalgOp->getParentOfType<FuncOp>().emitError(
-        "unhandled fusion of ops in dispatch function");
-
-  // TODO(ravishanarm, antiagainst): Only the maximum workgroup size is used
-  // here for computing tile sizes. In reality we also need the maximum
-  // workgroup memory size available (per workgroup) to compute the tile sizes
-  // effectively.
-  unsigned maxWorkgroupSize =
-      resourceLimits.max_compute_workgroup_invocations().getInt();
-  if (opInfo & OpInfo::Convolution) {
-    // TODO(ravishankarm): This tiling is meant to enable promotion to workgroup
-    // memory, but doesnt actually get us to a state where we can do this. The
-    // promotion is possible only when the subviews created are constant
-    // size. For now this doesnt really matter. Revisit this later.
-    int64_t tileSizeX = 32;
-    int64_t tileSizeY = maxWorkgroupSize / 32;
-    tileSizes = {1, tileSizeY, tileSizeX};
-    workgroupSize = {tileSizeX, tileSizeY, 1};
-    return success();
-  }
-  if (opInfo & OpInfo::Matmul) {
+  if (hasMatmul) {
     // TODO: For now just hard wire this, but we can do better.
     tileSizes = {8, 8, 4};
     workgroupSize = {8, 8, 1};
     return success();
   }
-  if (opInfo & OpInfo::Pooling) {
-    int64_t tileSizeX = 32;
-    int64_t tileSizeY = maxWorkgroupSize / 32;
-    tileSizes = {tileSizeY, tileSizeX};
-    workgroupSize = {tileSizeX, tileSizeY, 1};
-    return success();
-  }
-  return linalgOp->getParentOfType<FuncOp>().emitError(
-      "unable to find tile size for ops in this dispatch function");
+  setTileSizesBasedOnDimensions(numParallelLoops);
+  return success();
 }
 
 //===----------------------------------------------------------------------===//
@@ -254,41 +294,22 @@
   SmallVector<int64_t, 3> workGroupSize;
 };
 
-/// Pattern for tiling operations. Updates the workgroup size in the surrounding
-/// function operation if tiling succeeds.
-template <typename OpTy>
-struct TilingPattern : public linalg::LinalgTilingPattern<OpTy> {
-  using Base = linalg::LinalgTilingPattern<OpTy>;
-  TilingPattern(MLIRContext *context, linalg::LinalgTilingOptions options,
-                ArrayRef<int64_t> workgroupSize,
-                linalg::LinalgMarker marker = linalg::LinalgMarker(),
-                PatternBenefit benefit = 1)
-      : Base(context, options, marker, benefit),
-        workgroupSize(workgroupSize.begin(), workgroupSize.end()) {}
-
-  virtual LogicalResult matchAndRewrite(Operation *op,
-                                        PatternRewriter &rewriter) const {
-    // Find the parent FuncOp before tiling. If tiling succeeds, the op will be
-    // erased.
-    FuncOp funcOp = op->getParentOfType<FuncOp>();
-    return failure(!funcOp || failed(Base::matchAndRewrite(op, rewriter)) ||
-                   failed(updateWorkGroupSize(funcOp, workgroupSize)));
-  }
-
-  SmallVector<int64_t, 3> workgroupSize;
-};
-
-/// Pattern for tiling convolution and pooling operations. Currently is just a
-/// way to not tile when the operation has padding.
-template <typename OpTy>
-struct TileConvPoolPattern : public TilingPattern<OpTy> {
-  using Base = TilingPattern<OpTy>;
-  using Base::TilingPattern;
+/// Pattern to tile linalg operations if they have the workgroup marker.
+template <typename LinalgOp>
+struct TileLinalgOpPattern : public linalg::LinalgTilingPattern<LinalgOp> {
+  using linalg::LinalgTilingPattern<LinalgOp>::LinalgTilingPattern;
 
   LogicalResult matchAndRewrite(Operation *op,
                                 PatternRewriter &rewriter) const override {
-    if (cast<OpTy>(op).padding()) return failure();
-    return Base::matchAndRewrite(op, rewriter);
+    if (!hasWorkGroupMarker(op)) return failure();
+    if (succeeded(linalg::LinalgTilingPattern<LinalgOp>::matchAndRewrite(
+            op, rewriter)))
+      return success();
+    // Update the marker to map to global invocation ID.
+    rewriter.startRootUpdate(op);
+    setNoTileMarker(op);
+    rewriter.finalizeRootUpdate(op);
+    return success();
   }
 };
 
@@ -327,7 +348,14 @@
   auto linalgOps = block.getOps<linalg::LinalgOp>();
   if (linalgOps.empty()) return;
 
+  // Go through all the Linalg ops and set the marker to trigger tiling./
+  // TODO(ravishankarm): Move this to HLOToLinalgOnBuffers so that it is added
+  // on op-creation.
+  for (auto op : linalgOps)
+    if (!hasMarker(op)) setWorkGroupMarker(op);
+
   TileSizeCalculator tileSizeCalculator(funcOp);
+
   if (workGroupSize.empty()) {
     // Get the tile sizes to use for the lowering.
     SmallVector<int64_t, 3> tileSizes;
@@ -348,17 +376,20 @@
   });
 
   OwningRewritePatternList tilingPatterns;
-  tilingPatterns.insert<TileConvPoolPattern<linalg::ConvOp>,
-                        TilingPattern<linalg::MatmulOp>,
-                        TileConvPoolPattern<linalg::PoolingMaxOp>,
-                        TileConvPoolPattern<linalg::PoolingMinOp>,
-                        TileConvPoolPattern<linalg::PoolingSumOp>>(
+  tilingPatterns.insert<TileLinalgOpPattern<linalg::ConvOp>,
+                        TileLinalgOpPattern<linalg::CopyOp>,
+                        TileLinalgOpPattern<linalg::FillOp>,
+                        TileLinalgOpPattern<linalg::GenericOp>,
+                        TileLinalgOpPattern<linalg::IndexedGenericOp>,
+                        TileLinalgOpPattern<linalg::MatmulOp>,
+                        TileLinalgOpPattern<linalg::PoolingMaxOp>,
+                        TileLinalgOpPattern<linalg::PoolingMinOp>,
+                        TileLinalgOpPattern<linalg::PoolingSumOp>>(
       context,
       linalg::LinalgTilingOptions()
-          .setTileSizes(tileSizeCalculator.getTileSizes())
+          .setTileSizes(tileSizeCalculator.getTileSizesForLinalg())
           .setLoopType(linalg::LinalgTilingLoopType::ParallelLoops),
-      tileSizeCalculator.getWorkGroupSize(),
-      linalg::LinalgMarker(ArrayRef<Identifier>(),
+      linalg::LinalgMarker(Identifier::get(getWorkGroupMarker(), context),
                            Identifier::get(getWorkItemMarker(), context)));
   applyPatternsAndFoldGreedily(getOperation(), tilingPatterns);
 
@@ -392,6 +423,15 @@
       insertBarrierAfter(builder, linalgOp.getLoc(), linalgOp);
     }
   });
+
+  // Update the workgroup size to be consistent with the tile sizes used. Note
+  // the tile sizes are ordered from outer most to inner most loops. The
+  // heuristic is to map the inner loops to x, the next outer (if it exists) to
+  // y, and the next outer (if it exists) to z. So tile sizes are reversed to
+  // get the workgroup size.
+  if (failed(
+          updateWorkGroupSize(funcOp, tileSizeCalculator.getWorkGroupSize())))
+    return signalPassFailure();
 }
 
 //===----------------------------------------------------------------------===//

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/SplitDispatchFunctionPass.cpp b/iree/compiler/Conversion/LinalgToSPIRV/SplitDispatchFunctionPass.cpp
index c996161..4ca5f2f 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/SplitDispatchFunctionPass.cpp
+++ b/iree/compiler/Conversion/LinalgToSPIRV/SplitDispatchFunctionPass.cpp

@@ -15,11 +15,11 @@
 //===- SplitDispathFunctionPass.cpp ---------------------------------------===//
 //
 // This file implements a pass to split computation workload to multiple
-// sequential dispatch functions. This pass operates on Linalg ops and
-// scf.parallel op and prepares for lowering to GPU, where we need to tile the
-// workload to workgroups and workitems. If the workload involves computation A
-// and B, where B is dependent on A and A needs all workgroups to complete, then
-// we need to split A and B into different kernels because there is no mechanism
+// sequential dispatch functions. This pass operates on Linalg ops and prepares
+// for lowering to GPU, where we need to tile the workload to workgroups and
+// workitems. If the workload involves computation A and B, where B is
+// dependent on A and A needs all workgroups to complete, then we need
+// to split A and B into different kernels because there is no mechanism
 // to perform cross-workgroup synchronization within a single kernel.
 //
 //===----------------------------------------------------------------------===//
@@ -35,7 +35,6 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
-#include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/Builders.h"
@@ -52,20 +51,24 @@
 
 namespace {
 
+/// Returns true if the given `block` contains 0 or 1 Linalg structured ops.
+bool hasZeroOrOneLinalgOp(Block &block) {
+  auto ops = block.getOps<linalg::LinalgOp>();
+  return std::distance(ops.begin(), ops.end()) <= 1;
+}
+
 /// Returns true if the Linalg ops can be separated to multiple kernels.
-bool canSeparateOps(ArrayRef<Operation *> ops) {
-  if (llvm::any_of(ops, [](Operation *op) {
-        if (auto linalgOp = dyn_cast<linalg::LinalgOp>(op))
-          return !linalgOp.hasBufferSemantics();
-        return false;
+bool canSeparateLinalgOps(MutableArrayRef<linalg::LinalgOp> linalgOps) {
+  if (llvm::any_of(linalgOps, [](linalg::LinalgOp op) {
+        return !op.hasBufferSemantics();
       }))
     return false;
 
   // Require no other ops interleave with Linalg structured ops for now. This is
   // the common case and it simplifies further analysis.
-  for (auto currOp = ops.begin(), nextOp = std::next(ops.begin());
-       nextOp != ops.end(); ++currOp, ++nextOp) {
-    if ((*currOp)->getNextNode() != *nextOp) return false;
+  for (int i = 0, e = linalgOps.size() - 1; i < e; ++i) {
+    if (linalgOps[i].getOperation()->getNextNode() != linalgOps[i + 1])
+      return false;
   }
 
   return true;
@@ -141,20 +144,15 @@
     return oldFn.emitError("expected only one block");
   }
 
-  // The dispatch function should have more than one separable ops.  Otherwise
-  // there is nothing to do.
-  Block &fnBody = oldFn.getBlocks().front();
+  // The dispatch function should have more than one Linalg structured ops.
+  // Otherwise there is nothing to do.
+  if (hasZeroOrOneLinalgOp(oldFn.getBlocks().front())) return success();
 
-  // Collect all Linalg and scf.parallel ops for distributing.
-  SmallVector<Operation *, 4> separableOps;
-  for (Operation &op : fnBody)
-    if (isa<linalg::LinalgOp>(op) || isa<scf::ParallelOp>(op))
-      separableOps.push_back(&op);
-
-  if (separableOps.size() <= 1) return success();
-  if (!canSeparateOps(separableOps)) {
-    return oldFn.emitError(
-        "cannot separate Linalg/Parallel ops into multiple kernels");
+  // Collect all Linalg ops for distributing.
+  SmallVector<linalg::LinalgOp, 4> linalgOps =
+      llvm::to_vector<4>(oldFn.getBlocks().front().getOps<linalg::LinalgOp>());
+  if (!canSeparateLinalgOps(linalgOps)) {
+    return oldFn.emitError("cannot separate Linalg ops into multiple kernels");
   }
 
   ModuleOp moduleOp = cast<ModuleOp>(oldFn.getParentOp());
@@ -162,13 +160,13 @@
   Location loc = oldFn.getLoc();
 
   SmallVector<std::string, 4> splitKernels;
-  splitKernels.reserve(separableOps.size());
+  splitKernels.reserve(linalgOps.size());
   llvm::SmallPtrSet<Operation *, 16> closure;
 
-  for (const auto &separableOp : llvm::enumerate(separableOps)) {
-    // Create a new function for hosting this op.
-    splitKernels.emplace_back(llvm::formatv("{0}_dispatch_{1}", oldFn.getName(),
-                                            separableOp.index()));
+  for (const auto &linalgOp : llvm::enumerate(linalgOps)) {
+    // Create a new function for hosting this Linalg op.
+    splitKernels.emplace_back(
+        llvm::formatv("{0}_dispatch_{1}", oldFn.getName(), linalgOp.index()));
     StringRef newFnName = splitKernels.back();
     builder.setInsertionPointToStart(moduleOp.getBody());
     auto newFn = builder.create<FuncOp>(loc, newFnName, oldFn.getType(),
@@ -183,7 +181,7 @@
 
     // Collect the closure for the current Linalg op.
     closure.clear();
-    collectAllReferencedOps(separableOp.value(), closure);
+    collectAllReferencedOps(linalgOp.value(), closure);
 
     // Clone all ops in the closure to the new function.
     Block *newFnBlock = newFn.addEntryBlock();
@@ -192,14 +190,14 @@
     for (Operation &op : oldFnBlock) {
       if (closure.count(&op) == 0) continue;
       builder.insert(op.clone(remapper));
-      if (&op == separableOp.value()) break;
+      if (&op == linalgOp.value()) break;
     }
     builder.insert(oldFnBlock.getTerminator()->clone(remapper));
   }
 
   // Add the entry point schedule to the module op.
   SmallVector<Attribute, 4> entryPoints;
-  entryPoints.reserve(separableOps.size());
+  entryPoints.reserve(linalgOps.size());
   for (const std::string &kernel : splitKernels) {
     entryPoints.emplace_back(builder.getStringAttr(kernel));
   }

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/Utils.cpp b/iree/compiler/Conversion/LinalgToSPIRV/Utils.cpp
deleted file mode 100644
index 6fed42e..0000000
--- a/iree/compiler/Conversion/LinalgToSPIRV/Utils.cpp
+++ /dev/null

@@ -1,51 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//===- Utils.cpp - Utility functions used in Linalg to SPIR-V lowering ----===//
-//
-// Implementaiton of utility functions used while lowering from Linalg to SPIRV.
-//
-//===----------------------------------------------------------------------===//
-
-#include "iree/compiler/Conversion/LinalgToSPIRV/Utils.h"
-
-#include "mlir/Dialect/SPIRV/TargetAndABI.h"
-#include "mlir/IR/Function.h"
-#include "mlir/IR/Region.h"
-#include "mlir/Support/LogicalResult.h"
-
-namespace mlir {
-namespace iree_compiler {
-
-LogicalResult updateWorkGroupSize(FuncOp funcOp,
-                                  ArrayRef<int64_t> workGroupSize) {
-  // Need to update both the surrounding FuncOp that has the spv.entry_point_abi
-  // attribute, and the hal.executable.
-  Region &body = funcOp.getBody();
-  if (!llvm::hasSingleElement(body))
-    return funcOp.emitError("unhandled dispatch function with multiple blocks");
-
-  if (workGroupSize.size() != 3)
-    return funcOp.emitError("expected workgroup size to have three entries");
-  SmallVector<int32_t, 3> workGroupSizeVec = llvm::to_vector<3>(llvm::map_range(
-      workGroupSize, [](int64_t v) { return static_cast<int32_t>(v); }));
-
-  funcOp.setAttr(
-      spirv::getEntryPointABIAttrName(),
-      spirv::getEntryPointABIAttr(workGroupSizeVec, funcOp.getContext()));
-  return success();
-}
-
-}  // namespace iree_compiler
-}  // namespace mlir

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/Utils.h b/iree/compiler/Conversion/LinalgToSPIRV/Utils.h
deleted file mode 100644
index bdea68e..0000000
--- a/iree/compiler/Conversion/LinalgToSPIRV/Utils.h
+++ /dev/null

@@ -1,38 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//===- Utils.h - Utility functions used in Linalg to SPIR-V lowering ------===//
-//
-// Utility functions used while lowering from Linalg to SPIRV.
-//
-//===----------------------------------------------------------------------===//
-#ifndef IREE_COMPILER_CONVERSION_LINALGTOSPIRV_UTILS_H_
-#define IREE_COMPILER_CONVERSION_LINALGTOSPIRV_UTILS_H_
-
-#include "mlir/Support/LLVM.h"
-
-namespace mlir {
-class FuncOp;
-struct LogicalResult;
-
-namespace iree_compiler {
-
-/// Updates the workgroup size used for the dispatch region.
-LogicalResult updateWorkGroupSize(FuncOp funcOp,
-                                  ArrayRef<int64_t> workGroupSize);
-
-}  // namespace iree_compiler
-}  // namespace mlir
-
-#endif  //  IREE_COMPILER_CONVERSION_LINALGTOSPIRV_UTILS_H_

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/test/convert_to_gpu.mlir b/iree/compiler/Conversion/LinalgToSPIRV/test/convert_to_gpu.mlir
index 09e4101..9d81a75 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/test/convert_to_gpu.mlir
+++ b/iree/compiler/Conversion/LinalgToSPIRV/test/convert_to_gpu.mlir

@@ -1,201 +1,205 @@
-// RUN: iree-opt -iree-codegen-convert-to-gpu -canonicalize -cse -split-input-file %s | IreeFileCheck %s
+// RUN: iree-opt -iree-codegen-convert-to-gpu -canonicalize -split-input-file %s | IreeFileCheck %s
 
-#map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-module attributes {
-  spv.target_env =
-    #spv.target_env<#spv.vce<v1.3,
-    [Shader], [SPV_KHR_storage_buffer_storage_class]>,
-    {max_compute_workgroup_invocations = 128 : i32,
-     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
-  func @parallel_4D(%arg0: memref<?x?x?x?xf32>,
-                    %arg1 : memref<?x?x?x?xf32>,
-                    %arg2 : memref<?x?x?x?xf32>)
-  attributes {iree.dispatch_fn_name = "parallel_4D"} {
-    linalg.generic
-      {args_in = 2 : i64, args_out = 1 : i64,
-       indexing_maps = [#map0, #map0, #map0],
-       iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
-      %arg0, %arg1, %arg2 {
-    ^bb0(%arg3 : f32, %arg4 : f32, %arg5 : f32):
-      %0 = addf %arg3, %arg4 : f32
-      linalg.yield %0 : f32
-    } : memref<?x?x?x?xf32>, memref<?x?x?x?xf32>, memref<?x?x?x?xf32>
+#map0 = affine_map<(d0, d1, d2) -> (d0, d1 - d2)>
+#map1 = affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>
+#map2 = affine_map<(d0, d1) -> (d0, d1)>
+
+module {
+  func @pw_add(%arg0: memref<4x8xi32>, %arg1: memref<4x8xi32>,
+               %arg2: memref<4x8xi32>)
+  attributes {iree.dispatch_fn_name = "pw_add"} {
+    %c32 = constant 32 : index
+    %c0 = constant 0 : index
+    %c4 = constant 4 : index
+    %c8 = constant 8 : index
+    %c1 = constant 1 : index
+    scf.parallel (%arg3, %arg4) = (%c0, %c0) to (%c4, %c8) step (%c4, %c32) {
+      %0 = affine.min #map0(%c4, %c4, %arg3)
+      %1 = affine.min #map0(%c32, %c8, %arg4)
+      %2 = subview %arg0[%arg3, %arg4] [%0, %1] [%c1, %c1]
+             : memref<4x8xi32> to memref<?x?xi32, #map1>
+      %3 = affine.min #map0(%c4, %c4, %arg3)
+      %4 = affine.min #map0(%c32, %c8, %arg4)
+      %5 = subview %arg1[%arg3, %arg4] [%3, %4] [%c1, %c1]
+             : memref<4x8xi32> to memref<?x?xi32, #map1>
+      %6 = affine.min #map0(%c4, %c4, %arg3)
+      %7 = affine.min #map0(%c32, %c8, %arg4)
+      %8 = subview %arg2[%arg3, %arg4] [%6, %7] [%c1, %c1]
+             : memref<4x8xi32> to memref<?x?xi32, #map1>
+      linalg.generic
+        {args_in = 2 : i64, args_out = 1 : i64,
+         indexing_maps = [#map2, #map2, #map2],
+         iterator_types = ["parallel", "parallel"]}
+      {__internal_linalg_transform__ = "workitem"} %2, %5, %8 {
+      ^bb0(%arg5: i32, %arg6: i32, %arg7: i32): // no predecessors
+        %9 = addi %arg5, %arg6 : i32
+        linalg.yield %9 : i32
+      } : memref<?x?xi32, #map1>, memref<?x?xi32, #map1>, memref<?x?xi32, #map1>
+      scf.yield
+    }
     return
   }
 }
-// CHECK-LABEL: func @parallel_4D
-//  CHECK-SAME:   local_size = dense<[32, 1, 1]>
-//   CHECK-DAG:     %[[C0:.+]] = constant 0 : index
-//   CHECK-DAG:     %[[C1:.+]] = constant 1 : index
-//   CHECK-DAG:     %[[C2:.+]] = constant 2 : index
-//   CHECK-DAG:     %[[C3:.+]] = constant 3 : index
-//   CHECK-DAG:     %[[UB0:.+]] = dim %{{.*}}, %[[C0]]
-//   CHECK-DAG:     %[[UB1:.+]] = dim %{{.*}}, %[[C1]]
-//   CHECK-DAG:     %[[UB2:.+]] = dim %{{.*}}, %[[C2]]
-//   CHECK-DAG:     %[[UB3:.+]] = dim %{{.*}}, %[[C3]]
-//       CHECK:     %[[T4:.+]] = muli %[[UB3]], %[[UB2]]
-//       CHECK:     %[[T5:.+]] = muli %[[T4]], %[[UB1]]
-//       CHECK:     %[[UB:.+]] = muli %[[T5]], %[[UB0]]
-//   CHECK-DAG:     %[[BID:.+]] = "gpu.block_id"() {dimension = "x"}
-//   CHECK-DAG:     %[[BDIM:.+]] = "gpu.block_dim"() {dimension = "x"}
-//   CHECK-DAG:     %[[TID:.+]] = "gpu.thread_id"() {dimension = "x"}
-//       CHECK:     %[[BOFFSET:.+]] = muli %[[BID]], %[[BDIM]]
-//       CHECK:     %[[IV:.+]] = addi %[[BOFFSET]], %[[TID]]
-//       CHECK:     %[[COND:.+]] = cmpi "slt", %[[IV]], %[[UB]]
-//       CHECK:     scf.if %[[COND]]
-//       CHECK:       %[[IV0:.+]] = divi_signed %[[IV]], %[[T5]]
-//       CHECK:       %[[T14:.+]] = remi_signed %[[IV]], %[[T5]]
-//       CHECK:       %[[IV1:.+]] = divi_signed %[[T14]], %[[T4]]
-//       CHECK:       %[[T16:.+]] = remi_signed %[[T14]], %[[T4]]
-//       CHECK:       %[[IV2:.+]] = divi_signed %[[T16]], %[[UB3]]
-//       CHECK:       %[[IV3:.+]] = remi_signed %[[T16]], %[[UB3]]
-//       CHECK:       load %{{.*}}[%[[IV0]], %[[IV1]], %[[IV2]], %[[IV3]]]
-//       CHECK:       load %{{.*}}[%[[IV0]], %[[IV1]], %[[IV2]], %[[IV3]]]
-//       CHECK:       store %{{.*}}[%[[IV0]], %[[IV1]], %[[IV2]], %[[IV3]]]
-
-
-// -----
-
-#map0 = affine_map<() -> ()>
-#accesses = [#map0, #map0, #map0]
-#trait = {
-  args_in = 2 : i64,
-  args_out = 1 : i64,
-  indexing_maps = #accesses,
-  iterator_types = []
-}
-
-module attributes {
-  spv.target_env =
-    #spv.target_env<#spv.vce<v1.3,
-    [Shader], [SPV_KHR_storage_buffer_storage_class]>,
-    {max_compute_workgroup_invocations = 128 : i32,
-     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
-  func @scalar_add(%arg0 : memref<f32>, %arg1 : memref<f32>,
-                   %arg2 : memref<f32>)
-  {
-    linalg.generic #trait %arg0, %arg1, %arg2 {
-    ^bb0(%arg3 : f32, %arg4 : f32, %arg5 : f32):
-      %0 = addf %arg3, %arg4 : f32
-      linalg.yield %0 : f32
-     } : memref<f32>, memref<f32>, memref<f32>
-     return
-  }
-}
-// CHECK-LABEL: func @scalar_add
-//  CHECK-SAME:   local_size = dense<1> : vector<3xi32>
-//  CHECK-NEXT:     load
-//  CHECK-NEXT:     load
-//  CHECK-NEXT:     addf
-//  CHECK-NEXT:     store
-//  CHECK-NEXT:     return
+//   CHECK-DAG:   %[[STEPY:.+]] = constant 4 : index
+//   CHECK-DAG:   %[[STEPX:.+]] = constant 32 : index
+//   CHECK-DAG:   %[[BIDX:.+]] = "gpu.block_id"() {dimension = "x"}
+//   CHECK-DAG:   %[[NBLOCKSX:.+]] = "gpu.grid_dim"() {dimension = "x"}
+//   CHECK-DAG:   %[[BIDY:.+]] = "gpu.block_id"() {dimension = "y"}
+//   CHECK-DAG:   %[[NBLOCKSY:.+]] = "gpu.grid_dim"() {dimension = "y"}
+//       CHECK:   %[[NEWLBY:.+]] = muli %[[BIDY]], %[[STEPY]]
+//       CHECK:   %[[NEWSTEPY:.+]] = muli %[[NBLOCKSY]], %[[STEPY]]
+//       CHECK:   %[[NEWLBX:.+]] = muli %[[BIDX]], %[[STEPX]]
+//       CHECK:   %[[NEWSTEPX:.+]] = muli %[[NBLOCKSX]], %[[STEPX]]
+//       CHECK:   scf.for %{{.+}} = %[[NEWLBY]] to %{{.+}} step %[[NEWSTEPY]]
+//       CHECK:     scf.for %{{.+}} = %[[NEWLBX]] to %{{.+}} step %[[NEWSTEPX]]
+//   CHECK-DAG:       %[[TIDX:.+]] = "gpu.thread_id"() {dimension = "x"}
+//   CHECK-DAG:       %[[NTHREADSX:.+]] = "gpu.block_dim"() {dimension = "x"}
+//   CHECK-DAG:       %[[TIDY:.+]] = "gpu.thread_id"() {dimension = "y"}
+//   CHECK-DAG:       %[[NTHREADSY:.+]] = "gpu.block_dim"() {dimension = "y"}
+//       CHECK:       scf.for %{{.+}} = %[[TIDY]] to %{{.+}} step %[[NTHREADSY]]
+//       CHECK:         scf.for %{{.+}} = %[[TIDX]] to %{{.+}} step %[[NTHREADSX]]
 
 // -----
 
 module {
-  func @reduce_sum(%arg0: memref<?x?x?xf32>, %arg1: memref<f32>, %arg2: memref<?xf32>)
+  func @reduce_sum(%arg0: memref<4xf32>, %arg1: memref<f32>, %arg2: memref<f32>)
    attributes {iree.dispatch_fn_name = "reduce_sum"} {
     linalg.indexed_generic
       {args_in = 2 : i64, args_out = 1 : i64,
-       indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> ()>,
-                        affine_map<(d0, d1, d2) -> (d0)>],
-       iterator_types = ["parallel", "parallel", "reduction"]} %arg0, %arg1, %arg2 {
-    ^bb0(%arg3: index, %arg4: index, %arg5: index,
-         %arg6: f32, %arg7: f32, %arg8: f32):   // no predecessors
+       indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>,
+                        affine_map<(d0) -> ()>],
+       iterator_types = ["reduction"]} %arg0, %arg1, %arg2 {
+    ^bb0(%arg3: index, %arg4: f32, %arg5: f32, %arg6: f32):   // no predecessors
       %c0 = constant 0 : index
       %cst = constant true
-      %0 = cmpi "eq", %arg5, %c0 : index
+      %0 = cmpi "eq", %arg3, %c0 : index
       %1 = and %cst, %0 : i1
-      %2 = select %1, %arg7, %arg8 : f32
-      %3 = addf %arg6, %2 : f32
+      %2 = select %1, %arg5, %arg6 : f32
+      %3 = addf %arg4, %2 : f32
       linalg.yield %3 : f32
-    }: memref<?x?x?xf32>, memref<f32>, memref<?xf32>
+    }: memref<4xf32>, memref<f32>, memref<f32>
     return
   }
 }
-
-// CHECK-LABEL: func @reduce_sum
-//  CHECK-SAME:   local_size = dense<[32, 1, 1]> : vector<3xi32>
-//   CHECK-DAG:     %[[C0:.+]] = constant 0 : index
-//   CHECK-DAG:     %[[C1:.+]] = constant 1 : index
-//   CHECK-DAG:     %[[C2:.+]] = constant 2 : index
-//       CHECK:     %[[UB0:.+]] = dim %{{.*}}, %[[C0]]
-//       CHECK:     %[[UB1:.+]] = dim %{{.*}}, %[[C1]]
-//       CHECK:     %[[UB2:.+]] = dim %{{.*}}, %[[C2]]
-//       CHECK:     %[[UB:.+]] = muli %[[UB1]], %[[UB0]]
-//       CHECK:     %[[COND:.+]] = cmpi "slt", %{{.*}}, %[[UB]]
-//       CHECK:     scf.if %[[COND]]
-//       CHECK:       %[[IV0:.+]] = divi_signed %{{.*}}, %[[UB1]]
-//       CHECK:       %[[IV1:.+]] = remi_signed %{{.*}}, %[[UB1]]
-//       CHECK:       scf.for %[[IV:.+]] = %{{.*}} to %[[UB2]]
-//       CHECK:         %[[ISZERO:.+]] = cmpi "eq", %[[IV]], %[[C0]]
+// CHECK-DAG: %[[C0:.+]] = constant 0 : index
+// CHECK-DAG: %[[C4:.+]] = constant 4 : index
+// CHECK-DAG: %[[C1:.+]] = constant 1 : index
+//     CHECK:   scf.for %{{.+}} = %[[C0]] to %[[C4]] step %[[C1]]
+// CHECK-NOT:   scf
 
 // -----
 
-#map0 = affine_map<(d0)[s0] -> (8, -d0 + s0)>
-#map1 = affine_map<(d0)[s0] -> (4, -d0 + s0)>
-#map2 = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>
+#map0 = affine_map<(d0)[s0] -> (2, -d0 + s0)>
+#map1 = affine_map<(d0)[s0] -> (32, -d0 + s0)>
+#map2 = affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 
-module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
-  func @matmul(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>) attributes {spv.entry_point_abi = {local_size = dense<[8, 8, 1]> : vector<3xi32>}} {
+module {
+  func @parallel_4D(%arg0: memref<?x?x?x?xf32>, %arg1: memref<?x?x?x?xf32>, %arg2: memref<?x?x?x?xf32>) attributes {iree.dispatch_fn_name = "parallel_4D", spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
+    %c2 = constant 2 : index
+    %c32 = constant 32 : index
     %c0 = constant 0 : index
     %c1 = constant 1 : index
-    %c4 = constant 4 : index
-    %c8 = constant 8 : index
-    %0 = dim %arg0, %c0 : memref<?x?xf32>
-    %1 = dim %arg0, %c1 : memref<?x?xf32>
-    %2 = dim %arg1, %c1 : memref<?x?xf32>
-    scf.parallel (%arg3, %arg4) = (%c0, %c0) to (%0, %2) step (%c8, %c8) {
-      scf.for %arg5 = %c0 to %1 step %c4 {
-        %3 = affine.min #map0(%arg3)[%0]
-        %4 = affine.min #map1(%arg5)[%1]
-        %5 = subview %arg0[%arg3, %arg5] [%3, %4] [1, 1]  : memref<?x?xf32> to memref<?x?xf32, #map2>
-        %6 = dim %arg1, %c0 : memref<?x?xf32>
-        %7 = affine.min #map1(%arg5)[%6]
-        %8 = affine.min #map0(%arg4)[%2]
-        %9 = subview %arg1[%arg5, %arg4] [%7, %8] [1, 1]  : memref<?x?xf32> to memref<?x?xf32, #map2>
-        %10 = dim %arg2, %c0 : memref<?x?xf32>
-        %11 = affine.min #map0(%arg3)[%10]
-        %12 = dim %arg2, %c1 : memref<?x?xf32>
-        %13 = affine.min #map0(%arg4)[%12]
-        %14 = subview %arg2[%arg3, %arg4] [%11, %13] [1, 1]  : memref<?x?xf32> to memref<?x?xf32, #map2>
-        linalg.matmul %5, %9, %14 {__internal_linalg_transform__ = "workitem"} : (memref<?x?xf32, #map2>, memref<?x?xf32, #map2>, memref<?x?xf32, #map2>)
-      }
+    %c3 = constant 3 : index
+    %0 = dim %arg0, %c0 : memref<?x?x?x?xf32>
+    %1 = dim %arg0, %c1 : memref<?x?x?x?xf32>
+    %2 = dim %arg0, %c2 : memref<?x?x?x?xf32>
+    %3 = dim %arg0, %c3 : memref<?x?x?x?xf32>
+    scf.parallel (%arg3, %arg4, %arg5, %arg6) = (%c0, %c0, %c0, %c0) to (%0, %1, %2, %3) step (%c2, %c2, %c2, %c32) {
+      %12 = affine.min #map0(%arg3)[%0]
+      %13 = affine.min #map0(%arg4)[%1]
+      %14 = affine.min #map0(%arg5)[%2]
+      %15 = affine.min #map1(%arg6)[%3]
+      %16 = subview %arg0[%arg3, %arg4, %arg5, %c0] [%12, %13, %14, %15] [%c1, %c1, %c1, %c1] : memref<?x?x?x?xf32> to memref<?x?x?x?xf32, #map2>
+      %17 = subview %arg1[%arg3, %arg4, %arg5, %c0] [%12, %13, %14, %15] [%c1, %c1, %c1, %c1] : memref<?x?x?x?xf32> to memref<?x?x?x?xf32, #map2>
+      %18 = subview %arg2[%arg3, %arg4, %arg5, %c0] [%12, %13, %14, %15] [%c1, %c1, %c1, %c1] : memref<?x?x?x?xf32> to memref<?x?x?x?xf32, #map2>
+      linalg.generic {args_in = 2 : i64, args_out = 1 : i64,
+        indexing_maps = [#map3, #map3, #map3],
+        iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
+      {__internal_linalg_transform__ = "workitem"}
+      %16, %17, %18
+      {
+        ^bb0(%arg7: f32, %arg8: f32, %arg9: f32): // no predecessors
+          %19 = addf %arg7, %arg8 : f32
+          linalg.yield %19 : f32
+      } : memref<?x?x?x?xf32, #map2>, memref<?x?x?x?xf32, #map2>, memref<?x?x?x?xf32, #map2>
       scf.yield
     }
     return
   }
 }
 
-// CHECK-LABEL: func @matmul
-//  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9$._-]+]]: memref<?x?xf32>
-//  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9$._-]+]]: memref<?x?xf32>
-//  CHECK-SAME:   %[[ARG2:[a-zA-Z0-9$._-]+]]: memref<?x?xf32>
-//   CHECK-DAG:   %[[C4:.+]] = constant 4 : index
-//   CHECK-DAG:   %[[C8:.+]] = constant 8 : index
-//   CHECK-DAG:   %[[C0:.+]] = constant 0 : index
-//   CHECK-DAG:   %[[C1:.+]] = constant 1 : index
-//   CHECK-DAG:   %[[UB0:.+]] = dim %[[ARG0]], %[[C0]]
-//   CHECK-DAG:   %[[UB1:.+]] = dim %[[ARG1]], %[[C1]]
-//   CHECK-DAG:   %[[UB2:.+]] = dim %[[ARG0]], %[[C1]]
-//   CHECK-DAG:   %[[BIDX:.+]] = "gpu.block_id"() {dimension = "x"}
-//   CHECK-DAG:   %[[GDIMX:.+]] = "gpu.grid_dim"() {dimension = "x"}
-//   CHECK-DAG:   %[[BIDY:.+]] = "gpu.block_id"() {dimension = "y"}
-//   CHECK-DAG:   %[[GDIMY:.+]] = "gpu.grid_dim"() {dimension = "y"}
-//       CHECK:   %[[BOFFSETY:.+]] = muli %[[BIDY]], %[[C8]]
-//       CHECK:   %[[BSTEPY:.+]] = muli %[[GDIMY]], %[[C8]]
-//       CHECK:   %[[BOFFSETX:.+]] = muli %[[BIDX]], %[[C8]]
-//       CHECK:   %[[BSTEPX:.+]] = muli %[[GDIMX]], %[[C8]]
-//       CHECK:   scf.for %[[BIV0:.+]] = %[[BOFFSETY]] to %[[UB0]] step %[[BSTEPY]]
-//       CHECK:     scf.for %[[BIV1:.+]] = %[[BOFFSETX]] to %[[UB1]] step %[[BSTEPX]]
-//       CHECK:       scf.for %[[BIV2:.+]] = %[[C0]] to %[[UB2]] step %[[C4]]
-//   CHECK-DAG:         %[[VIEWUB0:.+]] = affine.min #{{.*}}(%[[BIV0]])[%[[UB0]]]
-//   CHECK-DAG:         %[[VIEWUB1:.+]] = affine.min #{{.*}}(%[[BIV1]])[%[[UB1]]]
-//   CHECK-DAG:         %[[VIEWUB2:.+]] = affine.min #{{.*}}(%[[BIV2]])[%[[UB2]]]
-//   CHECK-DAG:         %[[TIDX:.+]] = "gpu.thread_id"() {dimension = "x"}
-//   CHECK-DAG:         %[[TIDY:.+]] = "gpu.thread_id"() {dimension = "y"}
-//       CHECK:         %[[INBOUNDY:.+]] = cmpi "slt", %[[TIDY]], %[[VIEWUB0]]
-//       CHECK:         %[[INBOUNDX:.+]] = cmpi "slt", %[[TIDX]], %[[VIEWUB1]]
-//       CHECK:         %[[COND:.+]] = and %[[INBOUNDY]], %[[INBOUNDX]]
-//       CHECK:         scf.if %[[COND]]
-//       CHECK:           scf.for %{{.*}} = %[[C0]] to %[[VIEWUB2]] step %[[C1]]
+// CHECK-DAG: %[[C2:.+]] = constant 2 : index
+// CHECK-DAG: %[[C32:.+]] = constant 32 : index
+// CHECK-DAG: %[[C0:.+]] = constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = constant 1 : index
+// CHECK-DAG: %[[C3:.+]] = constant 3 : index
+// CHECK-DAG: %[[SERIALDIMOUTER:.+]] = dim %{{.+}}, %[[C3]]
+// CHECK-DAG: %[[BIDX:.+]] = "gpu.block_id"() {dimension = "x"} : () -> index
+// CHECK-DAG: %[[NBLOCKSX:.+]] = "gpu.grid_dim"() {dimension = "x"} : () -> index
+// CHECK-DAG: %[[BIDY:.+]] = "gpu.block_id"() {dimension = "y"} : () -> index
+// CHECK-DAG: %[[NBLOCKSY:.+]] = "gpu.grid_dim"() {dimension = "y"} : () -> index
+// CHECK-DAG: %[[BIDZ:.+]] = "gpu.block_id"() {dimension = "z"} : () -> index
+// CHECK-DAG: %[[NBLOCKSZ:.+]] = "gpu.grid_dim"() {dimension = "z"} : () -> index
+// CHECK-DAG: %[[LB0:.+]] = muli %[[BIDZ]], %[[C2]]
+// CHECK-DAG: %[[STEP0:.+]] = muli %[[NBLOCKSZ]], %[[C2]]
+// CHECK-DAG: %[[LB1:.+]] = muli %[[BIDY]], %[[C2]]
+// CHECK-DAG: %[[STEP1:.+]] = muli %[[NBLOCKSY]], %[[C2]]
+// CHECK-DAG: %[[LB2:.+]] = muli %[[BIDX]], %[[C2]]
+// CHECK-DAG: %[[STEP2:.+]] = muli %[[NBLOCKSX]], %[[C2]]
+//     CHECK: scf.for %{{.+}} = %[[LB0]] to %{{.+}} step %[[STEP0]]
+//     CHECK:   scf.for %{{.+}} = %[[LB1]] to %{{.+}} step %[[STEP1]]
+//     CHECK:     scf.for %{{.+}} = %[[LB2]] to %{{.+}} step %[[STEP2]]
+//     CHECK:       scf.for %{{.+}} = %[[C0]] to %[[SERIALDIMOUTER]] step %[[C32]]
+// CHECK-DAG:         %[[TIDX:.+]] = "gpu.thread_id"() {dimension = "x"} : () -> index
+// CHECK-DAG:         %[[NTHREADSX:.+]] = "gpu.block_dim"() {dimension = "x"} : () -> index
+// CHECK-DAG:         %[[TIDY:.+]] = "gpu.thread_id"() {dimension = "y"} : () -> index
+// CHECK-DAG:         %[[NTHREADSY:.+]] = "gpu.block_dim"() {dimension = "y"} : () -> index
+// CHECK-DAG:         %[[TIDZ:.+]] = "gpu.thread_id"() {dimension = "z"} : () -> index
+// CHECK-DAG:         %[[NTHREADSZ:.+]] = "gpu.block_dim"() {dimension = "z"} : () -> index
+//     CHECK:         scf.for %{{.+}} = %[[TIDZ]] to %{{.+}} step %[[NTHREADSZ]]
+//     CHECK:           scf.for %{{.+}} = %[[TIDY]] to %{{.+}} step %[[NTHREADSY]]
+//     CHECK:             scf.for %{{.+}} = %[[TIDX]] to %{{.+}} step %[[NTHREADSX]]
+//     CHECK:               scf.for %{{.+}} = %[[C0]] to %{{.+}} step %[[C1]]
+
+// -----
+
+module {
+  func @no_tile(%arg0: memref<?x?xf32>, %arg1 : memref<?x?xf32>,
+                %arg2 : memref<?x?xf32>)
+  attributes {iree.dispatch_fn_name = "reduce_sum"} {
+    linalg.generic
+      {args_in = 2 : i64, args_out = 1 : i64,
+       indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                        affine_map<(d0, d1) -> (d0, d1)>,
+                        affine_map<(d0, d1) -> (d0, d1)>],
+       iterator_types = ["parallel", "parallel"]}
+      {__internal_linalg_tranform__ = "no-tile"} %arg0, %arg1, %arg2 {
+    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
+      %0 = addf %arg3, %arg4 : f32
+      linalg.yield %0 : f32
+    }: memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>
+    return
+  }
+}
+
+// CHECK-DAG: %[[C0:.*]] = constant 0 : index
+// CHECK-DAG: %[[C1:.*]] = constant 1 : index
+// CHECK-DAG: %[[UBY:.+]] =  dim %{{.*}}, %[[C0]]
+// CHECK-DAG: %[[UBX:.+]] =  dim %{{.*}}, %[[C1]]
+// CHECK-DAG: %[[NBLOCKSX:.+]] = "gpu.grid_dim"() {dimension = "x"}
+// CHECK-DAG: %[[BIDX:.+]] = "gpu.block_id"() {dimension = "x"}
+// CHECK-DAG: %[[BLOCKSIZEX:.+]] = "gpu.block_dim"() {dimension = "x"}
+// CHECK-DAG: %[[TIDX:.+]] = "gpu.thread_id"() {dimension = "x"}
+//     CHECK: %[[T6:.+]] =  muli %[[BIDX]], %[[BLOCKSIZEX]]
+//     CHECK: %[[GIDX:.+]] =  addi %[[T6]], %[[TIDX]]
+//     CHECK: %[[NPROCSX:.+]] =  muli %[[BLOCKSIZEX]], %[[NBLOCKSX]]
+// CHECK-DAG: %[[NBLOCKSY:.+]] = "gpu.grid_dim"() {dimension = "y"}
+// CHECK-DAG: %[[BIDY:.+]] = "gpu.block_id"() {dimension = "y"}
+// CHECK-DAG: %[[BLOCKSIZEY:.+]] = "gpu.block_dim"() {dimension = "y"}
+// CHECK-DAG: %[[TIDY:.+]] = "gpu.thread_id"() {dimension = "y"}
+//     CHECK: %[[T6:.+]] =  muli %[[BIDY]], %[[BLOCKSIZEY]]
+//     CHECK: %[[GIDY:.+]] =  addi %[[T6]], %[[TIDY]]
+//     CHECK: %[[NPROCSY:.+]] =  muli %[[BLOCKSIZEY]], %[[NBLOCKSY]]
+//     CHECK: scf.for %{{.+}} = %[[GIDY]] to %[[UBY]] step %[[NPROCSY]]
+//     CHECK:   scf.for %{{.+}} = %[[GIDX]] to %[[UBX]] step %[[NPROCSX]]

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/test/linalg_tile_and_fuse.mlir b/iree/compiler/Conversion/LinalgToSPIRV/test/linalg_tile_and_fuse.mlir
index 1b5ddda..70f3d17 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/test/linalg_tile_and_fuse.mlir
+++ b/iree/compiler/Conversion/LinalgToSPIRV/test/linalg_tile_and_fuse.mlir

@@ -1,14 +1,65 @@
 // RUN: iree-opt -split-input-file -iree-codegen-linalg-tile-and-fuse %s | IreeFileCheck %s
 
-// Test to check that convolution with padding is not tiled.
 module attributes {
   spv.target_env =
     #spv.target_env<#spv.vce<v1.3,
     [Shader], [SPV_KHR_storage_buffer_storage_class]>,
     {max_compute_workgroup_invocations = 128 : i32,
      max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
+  // CHECK-LABEL: func @tile_only
+  //  CHECK-SAME: %[[ARG0:[a-zA-Z0-9$._-]+]]: memref<4x8xi32>
+  //  CHECK-SAME: %[[ARG1:[a-zA-Z0-9$._-]+]]: memref<4x8xi32>
+  //  CHECK-SAME: %[[ARG2:[a-zA-Z0-9$._-]+]]: memref<4x8xi32>
+  //  CHECK-SAME: local_size = dense<[32, 4, 1]>
+  //       CHECK: scf.parallel
+  //       CHECK:   %[[VIEW0:.+]] = subview %[[ARG0]]
+  //       CHECK:   %[[VIEW1:.+]] = subview %[[ARG1]]
+  //       CHECK:   %[[VIEW2:.+]] = subview %[[ARG2]]
+  //       CHECK:   linalg.generic
+  //  CHECK-SAME:     "workitem"
+  //  CHECK-SAME:     %[[VIEW0]]
+  //  CHECK-SAME:     %[[VIEW1]]
+  //  CHECK-SAME:     %[[VIEW2]]
+  func @tile_only(%arg0: memref<4x8xi32>, %arg1: memref<4x8xi32>,
+                  %arg2: memref<4x8xi32>) {
+    linalg.generic
+      {args_in = 2 : i64, args_out = 1 : i64,
+       indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                        affine_map<(d0, d1) -> (d0, d1)>,
+                        affine_map<(d0, d1) -> (d0, d1)>],
+       iterator_types = ["parallel", "parallel"]} %arg0, %arg1, %arg2 {
+    ^bb0(%arg3: i32, %arg4: i32, %arg5: i32):
+      %0 = addi %arg3, %arg4 : i32
+      linalg.yield %0 : i32
+    }: memref<4x8xi32>, memref<4x8xi32>, memref<4x8xi32>
+    return
+  }
+}
+
+// -----
+
+module attributes {
+  spv.target_env =
+    #spv.target_env<#spv.vce<v1.3,
+    [Shader], [SPV_KHR_storage_buffer_storage_class]>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
+  // CHECK-LABEL: func @conv_padding
+  //  CHECK-SAME: %[[ARG0:[a-zA-Z0-9$._-]+]]: memref<?x?x?x?xf32>
+  //  CHECK-SAME: %[[ARG1:[a-zA-Z0-9$._-]+]]: memref<?x?x?x?xf32>
+  //  CHECK-SAME: %[[ARG2:[a-zA-Z0-9$._-]+]]: memref<?x?x?x?xf32>
+  //  CHECK-SAME: local_size = dense<[32, 1, 1]>
+  //       CHECK: scf.parallel (%{{.+}})
+  //       CHECK:   %[[VIEW1:.+]] = subview %[[ARG1]]
+  //       CHECK:   %[[VIEW2:.+]] = subview %[[ARG2]]
+  //       CHECK:   linalg.conv
+  //  CHECK-SAME:     %[[VIEW1]]
+  //  CHECK-SAME:     %[[VIEW2]]
+  //  CHECK-SAME:     "workitem"
   func @conv_padding(%arg0 : memref<?x?x?x?xf32>, %arg1 : memref<?x?x?x?xf32>,
-                     %arg2 : memref<?x?x?x?xf32>) {
+                     %arg2 : memref<?x?x?x?xf32>)
+    attributes
+      {iree.dispatch_fn_name = "conv_padding"} {
     linalg.conv(%arg0, %arg1, %arg2)
       {dilations = [1, 1],
        padding = dense<[[1, 1], [0, 1]]> : tensor<2x2xi64>, strides = [1, 1]} :
@@ -16,14 +67,6 @@
     return
   }
 }
-// CHECK-LABEL: func @conv_padding
-//  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9$._-]+]]: memref<?x?x?x?xf32>
-//  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9$._-]+]]: memref<?x?x?x?xf32>
-//  CHECK-SAME:   %[[ARG2:[a-zA-Z0-9$._-]+]]: memref<?x?x?x?xf32>
-//       CHECK:   linalg.conv
-//  CHECK-SAME:     %[[ARG0]]
-//  CHECK-SAME:     %[[ARG1]]
-//  CHECK-SAME:     %[[ARG2]]
 
 // -----
 
@@ -33,24 +76,55 @@
     [Shader], [SPV_KHR_storage_buffer_storage_class]>,
     {max_compute_workgroup_invocations = 128 : i32,
      max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
+  // CHECK-LABEL: func @conv_no_padding
+  //  CHECK-SAME: %[[ARG0:[a-zA-Z0-9$._-]+]]: memref<?x?x?x?xf32>
+  //  CHECK-SAME: %[[ARG1:[a-zA-Z0-9$._-]+]]: memref<?x?x?x?xf32>
+  //  CHECK-SAME: %[[ARG2:[a-zA-Z0-9$._-]+]]: memref<?x?x?x?xf32>
+  //  CHECK-SAME: local_size = dense<[32, 2, 2]>
+  //       CHECK: scf.parallel (%{{.+}}, %{{.+}}, %{{.+}})
+  //       CHECK:   %[[VIEW1:.+]] = subview %[[ARG1]]
+  //       CHECK:   %[[VIEW2:.+]] = subview %[[ARG2]]
+  //       CHECK:   linalg.conv
+  //  CHECK-SAME:     %[[VIEW1]]
+  //  CHECK-SAME:     %[[VIEW2]]
+  //  CHECK-SAME:     "workitem"
   func @conv_no_padding(%arg0 : memref<?x?x?x?xf32>, %arg1 : memref<?x?x?x?xf32>,
-                        %arg2 : memref<?x?x?x?xf32>) {
+                        %arg2 : memref<?x?x?x?xf32>)
+    attributes
+      {iree.dispatch_fn_name = "conv_no_padding"} {
     linalg.conv(%arg0, %arg1, %arg2) {dilations = [1, 1], strides = [1, 1]} :
       memref<?x?x?x?xf32>, memref<?x?x?x?xf32>, memref<?x?x?x?xf32>
     return
   }
 }
-// CHECK-LABEL: func @conv_no_padding
-//  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9$._-]+]]: memref<?x?x?x?xf32>
-//  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9$._-]+]]: memref<?x?x?x?xf32>
-//  CHECK-SAME:   %[[ARG2:[a-zA-Z0-9$._-]+]]: memref<?x?x?x?xf32>
-//  CHECK-SAME:   local_size = dense<[32, 4, 1]>
-//       CHECK:   scf.parallel (%{{.+}}, %{{.+}}, %{{.+}})
-//       CHECK:     %[[VIEW1:.+]] = subview %[[ARG1]]
-//       CHECK:     %[[VIEW2:.+]] = subview %[[ARG2]]
-//       CHECK:     linalg.conv
-//  CHECK-SAME:       %[[ARG0]], %[[VIEW1]], %[[VIEW2]]
-//  CHECK-SAME:       "workitem"
+
+// -----
+
+#map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+module attributes {
+  spv.target_env =
+    #spv.target_env<#spv.vce<v1.3,
+    [Shader], [SPV_KHR_storage_buffer_storage_class]>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
+  // CHECK-LABEL: func @parallel_4D
+  //       CHECK: scf.parallel (%{{.+}}, %{{.+}}, %{{.+}})
+  func @parallel_4D(%arg0: memref<?x?x?x?xf32>,
+                    %arg1 : memref<?x?x?x?xf32>,
+                    %arg2 : memref<?x?x?x?xf32>)
+  attributes {iree.dispatch_fn_name = "parallel_4D"} {
+    linalg.generic
+      {args_in = 2 : i64, args_out = 1 : i64,
+       indexing_maps = [#map0, #map0, #map0],
+       iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
+      %arg0, %arg1, %arg2 {
+    ^bb0(%arg3 : f32, %arg4 : f32, %arg5 : f32):
+      %0 = addf %arg3, %arg4 : f32
+      linalg.yield %0 : f32
+    } : memref<?x?x?x?xf32>, memref<?x?x?x?xf32>, memref<?x?x?x?xf32>
+    return
+  }
+}
 
 // -----
 
@@ -60,52 +134,54 @@
     [Shader], [SPV_KHR_storage_buffer_storage_class]>,
     {max_compute_workgroup_invocations = 128 : i32,
      max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
-  func @matmul(%arg0: memref<?x?xf32>,
+  func @no_tile(%arg0: memref<?x?xf32>,
                 %arg1: memref<?x?xf32>,
                 %ret0: memref<?x?xf32>) {
-    linalg.matmul %arg0, %arg1, %ret0 :
+    linalg.matmul %arg0, %arg1, %ret0 {__internal_linalg_transform__ = "no-tile"} :
       (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>)
     return
   }
 }
-
-// CHECK-LABEL: func @matmul
-//  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9$._-]+]]: memref<?x?xf32>
-//  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9$._-]+]]: memref<?x?xf32>
-//  CHECK-SAME:   %[[ARG2:[a-zA-Z0-9$._-]+]]: memref<?x?xf32>
+// CHECK-LABEL: func @no_tile
 //  CHECK-SAME:   local_size = dense<[8, 8, 1]>
-//       CHECK:   scf.parallel (%{{.+}}, %{{.+}}, %{{.+}})
-//       CHECK:     %[[VIEW0:.+]] = subview %[[ARG0]]
-//       CHECK:     %[[VIEW1:.+]] = subview %[[ARG1]]
-//       CHECK:     %[[VIEW2:.+]] = subview %[[ARG2]]
-//       CHECK:     linalg.matmul
-//  CHECK-SAME:       "workitem"
-//  CHECK-SAME:       %[[VIEW0]], %[[VIEW1]], %[[VIEW2]]
+//   CHECK-NOT:   scf
+//       CHECK:   linalg.matmul
+//   CHECK-NOT:   scf
+//       CHECK:   return
 
 // -----
 
+#map0 = affine_map<() -> ()>
+#accesses = [#map0, #map0]
+#trait = {
+  args_in = 2 : i64,
+  args_out = 1 : i64,
+  indexing_maps = #accesses,
+  iterator_types = []
+}
+
 module attributes {
   spv.target_env =
     #spv.target_env<#spv.vce<v1.3,
     [Shader], [SPV_KHR_storage_buffer_storage_class]>,
     {max_compute_workgroup_invocations = 128 : i32,
      max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
-  func @pooling_sum_no_padding(%arg0 : memref<?x?xf32>, %arg1 : memref<?x?xf32>,
-                               %arg2 : memref<?x?xf32>) {
-    linalg.pooling_max(%arg0, %arg1, %arg2) {dilations = [1, 1], strides = [1, 1]} :
-      memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>
-    return
+  func @scalar_add(%arg0 : memref<f32>, %arg1 : memref<f32>,
+                   %arg2 : memref<f32>)
+  {
+    linalg.generic #trait %arg0, %arg1, %arg2 {
+    ^bb0(%arg3 : f32, %arg4 : f32, %arg5 : f32):
+      %0 = addf %arg3, %arg4 : f32
+      linalg.yield %0 : f32
+     } : memref<f32>, memref<f32>, memref<f32>
+     return
   }
 }
-
-// CHECK-LABEL: func @pooling_sum_no_padding
-//  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9$._-]+]]: memref<?x?xf32>
-//  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9$._-]+]]: memref<?x?xf32>
-//  CHECK-SAME:   %[[ARG2:[a-zA-Z0-9$._-]+]]: memref<?x?xf32>
-//  CHECK-SAME:   local_size = dense<[32, 4, 1]>
-//       CHECK:   scf.parallel (%{{.+}}, %{{.+}})
-//       CHECK:     %[[VIEW0:.+]] = subview %[[ARG0]]
-//       CHECK:     %[[VIEW2:.+]] = subview %[[ARG2]]
-//       CHECK:     linalg.pooling_max
-//  CHECK-SAME:       %[[VIEW0]], %[[ARG1]], %[[VIEW2]]
-//  CHECK-SAME:       "workitem"
+// CHECK-LABEL: func @scalar_add
+//   CHECK-NOT:   scf.parallel
+//   CHECK-NOT:   scf.for
+//       CHECK:   linalg.generic
+//  CHECK-SAME:     "no-tile"
+//   CHECK-NOT:   scf.parallel
+//   CHECK-NOT:   scf.for
+//       CHECK:   return

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/test/split_dispatch_function.mlir b/iree/compiler/Conversion/LinalgToSPIRV/test/split_dispatch_function.mlir
index 637fd7b..81db628 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/test/split_dispatch_function.mlir
+++ b/iree/compiler/Conversion/LinalgToSPIRV/test/split_dispatch_function.mlir

@@ -46,65 +46,6 @@
 
 // -----
 
-// CHECK: module attributes {vkspv.entry_point_schedule = ["kernel_dispatch_0", "kernel_dispatch_1", "kernel_dispatch_2"]}
-module {
-  // CHECK: func @kernel_dispatch_2()
-  // CHECK:   %[[DIM:.+]] = hal.interface.load.constant
-  // CHECK:   %[[SHAPE1:.+]] = shapex.make_ranked_shape %[[DIM]]
-  // CHECK:   %[[SHAPE2:.+]] = shapex.make_ranked_shape %[[DIM]]
-  // CHECK:   %[[IN1:.+]] = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<?x2x2x512xf32>
-  // CHECK:   %[[TS1:.+]] = shapex.tie_shape %[[IN1]], %[[SHAPE1]]
-  // CHECK:   %[[IN2:.+]] = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg1} : memref<3x3x512x1xf32>
-  // CHECK:   %[[OUT:.+]] = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<?x1x1x512xf32>
-  // CHECK:   %[[TS2:.+]] = shapex.tie_shape %[[OUT]], %[[SHAPE2]]
-  // CHECK:   linalg.conv(%[[IN2]], %[[TS1]], %[[TS2]])
-  // CHECK:   return
-
-  // CHECK: func @kernel_dispatch_1() {
-  // CHECK:   %[[C0:.+]] = constant 0 : index
-  // CHECK:   %[[C1:.+]] = constant 1 : index
-  // CHECK:   scf.parallel (%{{.*}}) = (%[[C0]]) to (%[[C1]]) step (%[[C1]])
-  // CHECK:     scf.yield
-  // CHECK:   return
-
-  // CHECK: func @kernel_dispatch_0()
-  // CHECK:   %[[ZERO:.+]] = constant
-  // CHECK:   %[[DIM:.+]] = hal.interface.load.constant
-  // CHECK:   %[[SHAPE:.+]] = shapex.make_ranked_shape %[[DIM]]
-  // CHECK:   %[[OUT:.+]] = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<?x1x1x512xf32>
-  // CHECK:   %[[TS:.+]] = shapex.tie_shape %[[OUT]], %[[SHAPE]]
-  // CHECK:   linalg.fill(%[[TS]], %[[ZERO]])
-  // CHECK:   return
-
-  func @kernel() {
-    %cst = constant 0.000000e+00 : f32
-    %c0 = constant 0 : index
-    %c1 = constant 1 : index
-    %dim = hal.interface.load.constant offset = 0 : index
-    %shape1 = shapex.make_ranked_shape %dim : (index) -> !shapex.ranked_shape<[?,2,2,512]>
-    %shape2 = shapex.make_ranked_shape %dim : (index) -> !shapex.ranked_shape<[?,1,1,512]>
-    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<?x2x2x512xf32>
-    %ts1 = shapex.tie_shape %0, %shape1 : memref<?x2x2x512xf32>, !shapex.ranked_shape<[?,2,2,512]>
-    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg1} : memref<3x3x512x1xf32>
-    %2 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<?x1x1x512xf32>
-    %ts2 = shapex.tie_shape %2, %shape2 : memref<?x1x1x512xf32>, !shapex.ranked_shape<[?,1,1,512]>
-    linalg.fill(%ts2, %cst) : memref<?x1x1x512xf32>, f32
-    scf.parallel (%iv) = (%c0) to (%c1) step (%c1) {
-      scf.yield
-    }
-    linalg.conv(%1, %ts1, %ts2) {dilations = [1, 1], padding = dense<[[0, 1], [0, 1]]> : tensor<2x2xi64>, strides = [2, 2]} : memref<3x3x512x1xf32>, memref<?x2x2x512xf32>, memref<?x1x1x512xf32>
-    return
-  }
-  hal.interface @legacy_io attributes {push_constants = 1 : i32, sym_visibility = "private"} {
-    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-  }
-}
-
-
-// -----
-
 // Nothing to do if there is just one Linalg op.
 
 // CHECK-NOT: vkspv.entry_point_schedule
@@ -130,7 +71,7 @@
 // Do not split when Linalg and non-Linalg ops are interleaving each other.
 
 module {
-  // expected-error @+1 {{cannot separate Linalg/Parallel ops into multiple kernels}}
+  // expected-error @+1 {{cannot separate Linalg ops into multiple kernels}}
   func @kernel() {
     %cst = constant 0.000000e+00 : f32
     %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x2x2x512xf32>

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/test/workgroup_memory_promotion.mlir b/iree/compiler/Conversion/LinalgToSPIRV/test/workgroup_memory_promotion.mlir
index 76cfcb8..060dc5a 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/test/workgroup_memory_promotion.mlir
+++ b/iree/compiler/Conversion/LinalgToSPIRV/test/workgroup_memory_promotion.mlir

@@ -5,7 +5,7 @@
     %arg0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<96x96xf32>
     %arg1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg1} : memref<96x96xf32>
     %arg2 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<96x96xf32>
-    linalg.matmul %arg0, %arg1, %arg2 :
+    linalg.matmul %arg0, %arg1, %arg2 {__internal_linalg_transform__ = "workgroup"} :
       (memref<96x96xf32>, memref<96x96xf32>, memref<96x96xf32>)
     return
   }

diff --git a/iree/hal/vmla/op_kernels.h b/iree/hal/vmla/op_kernels.h
index 093d52d..b7cb8b7 100644
--- a/iree/hal/vmla/op_kernels.h
+++ b/iree/hal/vmla/op_kernels.h

@@ -442,6 +442,6 @@
 }  // namespace iree
 
 #include "iree/hal/vmla/op_kernels_generic.h"  // IWYU pragma: export
-#include "iree/hal/vmla/op_kernels_ruy.h"      // IWYU pragma: export
+#include "iree/hal/vmla/op_kernels_ruy.h"  // IWYU pragma: export
 
 #endif  // IREE_HAL_VMLA_OP_KERNELS_H_

diff --git a/iree/hal/vulkan/BUILD b/iree/hal/vulkan/BUILD
index 8cf2112..d4c5753 100644
--- a/iree/hal/vulkan/BUILD
+++ b/iree/hal/vulkan/BUILD

@@ -37,6 +37,8 @@
     },
 )
 
+# TODO(antiagainst): expose configuration for emulated timeline semaphore
+
 cc_library(
     name = "api",
     srcs = ["api.cc"],
@@ -91,6 +93,7 @@
         ":pipeline_executable",
         ":status_util",
         ":vma_allocator",
+        "//iree/base:alignment",
         "//iree/base:arena",
         "//iree/base:math",
         "//iree/base:status",
@@ -187,6 +190,25 @@
 )
 
 cc_library(
+    name = "emulated_timeline_semaphore",
+    srcs = ["emulated_timeline_semaphore.cc"],
+    hdrs = ["emulated_timeline_semaphore.h"],
+    deps = [
+        ":handle_util",
+        ":status_util",
+        ":timepoint_util",
+        "//iree/base:intrusive_list",
+        "//iree/base:status",
+        "//iree/base:tracing",
+        "//iree/hal:semaphore",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        "@iree_vulkan_headers//:vulkan_headers_no_prototypes",
+    ],
+)
+
+cc_library(
     name = "extensibility_util",
     srcs = ["extensibility_util.cc"],
     hdrs = ["extensibility_util.h"],
@@ -326,6 +348,24 @@
 )
 
 cc_library(
+    name = "serializing_command_queue",
+    srcs = ["serializing_command_queue.cc"],
+    hdrs = ["serializing_command_queue.h"],
+    deps = [
+        ":direct_command_buffer",
+        ":emulated_timeline_semaphore",
+        ":handle_util",
+        ":status_util",
+        ":timepoint_util",
+        "//iree/base:status",
+        "//iree/base:tracing",
+        "//iree/hal:command_queue",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/synchronization",
+    ],
+)
+
+cc_library(
     name = "status_util",
     srcs = ["status_util.cc"],
     hdrs = ["status_util.h"],
@@ -336,6 +376,21 @@
 )
 
 cc_library(
+    name = "timepoint_util",
+    srcs = ["timepoint_util.cc"],
+    hdrs = ["timepoint_util.h"],
+    deps = [
+        ":handle_util",
+        "//iree/base:intrusive_list",
+        "//iree/base:ref_ptr",
+        "//iree/base:status",
+        "//iree/base:tracing",
+        "@com_google_absl//absl/synchronization",
+        "@iree_vulkan_headers//:vulkan_headers_no_prototypes",
+    ],
+)
+
+cc_library(
     name = "vma_allocator",
     srcs = [
         "internal_vk_mem_alloc.cc",
@@ -384,6 +439,7 @@
         ":direct_command_buffer",
         ":direct_command_queue",
         ":dynamic_symbols",
+        ":emulated_timeline_semaphore",
         ":extensibility_util",
         ":handle_util",
         ":native_descriptor_set",
@@ -391,6 +447,7 @@
         ":native_timeline_semaphore",
         ":pipeline_cache",
         ":pipeline_executable_layout",
+        ":serializing_command_queue",
         ":status_util",
         ":vma_allocator",
         "//iree/base:math",

diff --git a/iree/hal/vulkan/CMakeLists.txt b/iree/hal/vulkan/CMakeLists.txt
index 1904a44..663437d 100644
--- a/iree/hal/vulkan/CMakeLists.txt
+++ b/iree/hal/vulkan/CMakeLists.txt

@@ -12,6 +12,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# TODO(antiagainst): We should probably always compiling the emulation in and
+# probe at runtime to enable if the device does not support native timeline
+# semaphore.
+option(IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORE
+       "Emulates timeline semaphore with binary semaphores and fences" OFF)
+
+# Unconditionally turn on emulated timleine semaphore for Android.
+if(CMAKE_CROSSCOMPILING AND "${CMAKE_SYSTEM_NAME}" MATCHES "Android")
+  set(IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORE ON CACHE BOOL "" FORCE)
+endif()
+# Unless we are not compiling Vulkan HAL backend in.
+if(NOT IREE_HAL_DRIVER_VULKAN)
+  set(IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORE OFF CACHE BOOL "" FORCE)
+endif()
+
+if(IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORE)
+  set(IREE_EMULATE_TIMELINE_SEMAPHORE 1)
+else()
+  set(IREE_EMULATE_TIMELINE_SEMAPHORE 0)
+endif()
+
 set(VMA_SRC_ROOT
   "${IREE_ROOT_DIR}/third_party/vulkan_memory_allocator/src/"
 )
@@ -86,6 +107,7 @@
   COPTS
     "-DVK_NO_PROTOTYPES"
   DEPS
+    iree::base::alignment
     iree::base::arena
     iree::base::math
     iree::base::status
@@ -198,6 +220,30 @@
 
 iree_cc_library(
   NAME
+    emulated_timeline_semaphore
+  HDRS
+    "emulated_timeline_semaphore.h"
+  SRCS
+    "emulated_timeline_semaphore.cc"
+  COPTS
+    "-DVK_NO_PROTOTYPES"
+  DEPS
+    ::handle_util
+    ::status_util
+    ::timepoint_util
+    absl::inlined_vector
+    absl::synchronization
+    absl::time
+    iree::base::intrusive_list
+    iree::base::status
+    iree::base::tracing
+    iree::hal::semaphore
+    Vulkan::Headers
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
     extensibility_util
   HDRS
     "extensibility_util.h"
@@ -224,9 +270,11 @@
   COPTS
     "-DVK_NO_PROTOTYPES"
   DEPS
+    absl::inlined_vector
     absl::synchronization
     absl::utility
     iree::base::ref_ptr
+    iree::hal::command_queue
     iree::hal::vulkan::dynamic_symbols
     iree::hal::vulkan::extensibility_util
     Vulkan::Headers
@@ -375,6 +423,30 @@
 
 iree_cc_library(
   NAME
+    serializing_command_queue
+  HDRS
+    "serializing_command_queue.h"
+  SRCS
+    "serializing_command_queue.cc"
+  COPTS
+    "-DVK_NO_PROTOTYPES"
+  DEPS
+    ::direct_command_buffer
+    ::emulated_timeline_semaphore
+    ::handle_util
+    ::status_util
+    ::timepoint_util
+    absl::inlined_vector
+    absl::synchronization
+    iree::base::status
+    iree::base::tracing
+    iree::hal::command_queue
+    Vulkan::Headers
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
     status_util
   HDRS
     "status_util.h"
@@ -390,6 +462,26 @@
 
 iree_cc_library(
   NAME
+    timepoint_util
+  HDRS
+    "timepoint_util.h"
+  SRCS
+    "timepoint_util.cc"
+  COPTS
+    "-DVK_NO_PROTOTYPES"
+  DEPS
+    ::handle_util
+    absl::synchronization
+    iree::base::intrusive_list
+    iree::base::ref_ptr
+    iree::base::status
+    iree::base::tracing
+    Vulkan::Headers
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
     vma_allocator
   HDRS
     "vma_allocator.h"
@@ -430,18 +522,21 @@
   SRCS
     "vulkan_device.cc"
   COPTS
+    "-DIREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES=${IREE_EMULATE_TIMELINE_SEMAPHORE}"
     "-DVK_NO_PROTOTYPES"
   DEPS
     ::descriptor_pool_cache
     ::direct_command_buffer
     ::direct_command_queue
     ::dynamic_symbols
+    ::emulated_timeline_semaphore
     ::extensibility_util
     ::handle_util
     ::native_descriptor_set
     ::native_timeline_semaphore
     ::pipeline_cache
     ::pipeline_executable_layout
+    ::serializing_command_queue
     ::status_util
     ::vma_allocator
     absl::inlined_vector
@@ -497,6 +592,7 @@
   SRCS
     "vulkan_driver_module.cc"
   COPTS
+    "-DIREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES=${IREE_EMULATE_TIMELINE_SEMAPHORE}"
     "-DVK_NO_PROTOTYPES"
   DEPS
     absl::flags

diff --git a/iree/hal/vulkan/descriptor_set_arena.cc b/iree/hal/vulkan/descriptor_set_arena.cc
index c51a410..54d8902 100644
--- a/iree/hal/vulkan/descriptor_set_arena.cc
+++ b/iree/hal/vulkan/descriptor_set_arena.cc

@@ -14,6 +14,7 @@
 
 #include "iree/hal/vulkan/descriptor_set_arena.h"
 
+#include "iree/base/alignment.h"
 #include "iree/base/math.h"
 #include "iree/base/tracing.h"
 #include "iree/hal/vulkan/status_util.h"
@@ -47,7 +48,24 @@
     buffer_info.buffer = buffer->handle();
     // TODO(benvanik): properly subrange (add to BufferBinding).
     buffer_info.offset = binding.buffer->byte_offset();
-    buffer_info.range = binding.buffer->byte_length();
+    // Round up to a multiple of 32-bit. 32-bit is the most native bitwidth on
+    // GPUs; it has the best support compared to other bitwidths. We use VMA to
+    // manage GPU memory for us and VMA should already handled proper alignment
+    // when performing allocations; here we just need to provide the proper
+    // "view" to Vulkan drivers over the allocated memory.
+    //
+    // Note this is needed because we can see unusal buffers like tensor<3xi8>.
+    // Depending on GPU capabilities, this might not always be directly
+    // supported by the hardware. Under such circumstances, we need to emulate
+    // i8 support with i32. Shader CodeGen takes care of that: the shader will
+    // read the buffer as tensor<i32> and perform bit shifts to extract each
+    // byte and conduct computations. The extra additional byte is read but
+    // not really used by the shader. Here in application we need to match the
+    // ABI and provide the buffer as 32-bit aligned, otherwise the whole read by
+    // the shader is considered as out of bounds per the Vulkan spec.
+    // See https://github.com/google/iree/issues/2022#issuecomment-640617234
+    // for more details.
+    buffer_info.range = iree_align(binding.buffer->byte_length(), 4);
 
     auto& write_info = write_infos[i];
     write_info.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;

diff --git a/iree/hal/vulkan/dynamic_symbols.cc b/iree/hal/vulkan/dynamic_symbols.cc
index 2083031..c8f6d23 100644
--- a/iree/hal/vulkan/dynamic_symbols.cc
+++ b/iree/hal/vulkan/dynamic_symbols.cc

@@ -76,7 +76,9 @@
                                       DEV_PFN_FUNCTION_PTR)};
 
 static const char* kVulkanLoaderSearchNames[] = {
-#if defined(IREE_PLATFORM_WINDOWS)
+#if defined(IREE_PLATFORM_ANDROID)
+    "libvulkan.so",
+#elif defined(IREE_PLATFORM_WINDOWS)
     "vulkan-1.dll",
 #else
     "libvulkan.so.1",

diff --git a/iree/hal/vulkan/dynamic_symbols.h b/iree/hal/vulkan/dynamic_symbols.h
index 4fc56e9..8983b0a 100644
--- a/iree/hal/vulkan/dynamic_symbols.h
+++ b/iree/hal/vulkan/dynamic_symbols.h

@@ -106,8 +106,8 @@
   // Each required and optional function in the loader tables will expand to
   // the following member, such as for example 'vkSomeFunction':
   //   PFN_vkSomeFunction vkSomeFunction;
-#define REQUIRED_PFN(function_name) PFN_##function_name function_name
-#define OPTIONAL_PFN(function_name) PFN_##function_name function_name
+#define REQUIRED_PFN(function_name) PFN_##function_name function_name = nullptr
+#define OPTIONAL_PFN(function_name) PFN_##function_name function_name = nullptr
 #define EXCLUDED_PFN(function_name)
 #define PFN_MEMBER(requirement, function_name) requirement##_PFN(function_name);
   REQUIRED_PFN(vkGetInstanceProcAddr);

diff --git a/iree/hal/vulkan/emulated_timeline_semaphore.cc b/iree/hal/vulkan/emulated_timeline_semaphore.cc
new file mode 100644
index 0000000..9656310
--- /dev/null
+++ b/iree/hal/vulkan/emulated_timeline_semaphore.cc

@@ -0,0 +1,322 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/hal/vulkan/emulated_timeline_semaphore.h"
+
+#include "absl/container/inlined_vector.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/time/time.h"
+#include "absl/utility/utility.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/status_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+// static
+StatusOr<ref_ptr<Semaphore>> EmulatedTimelineSemaphore::Create(
+    ref_ptr<VkDeviceHandle> logical_device,
+    std::function<Status(Semaphore*)> on_signal,
+    std::function<void(Semaphore*)> on_failure,
+    ref_ptr<TimePointSemaphorePool> semaphore_pool, uint64_t initial_value) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::Create");
+  return make_ref<EmulatedTimelineSemaphore>(
+      std::move(logical_device), std::move(on_signal), std::move(on_failure),
+      std::move(semaphore_pool), initial_value);
+}
+
+EmulatedTimelineSemaphore::EmulatedTimelineSemaphore(
+    ref_ptr<VkDeviceHandle> logical_device,
+    std::function<Status(Semaphore*)> on_signal,
+    std::function<void(Semaphore*)> on_failure,
+    ref_ptr<TimePointSemaphorePool> semaphore_pool, uint64_t initial_value)
+    : signaled_value_(initial_value),
+      logical_device_(std::move(logical_device)),
+      on_signal_(std::move(on_signal)),
+      on_failure_(std::move(on_failure)),
+      semaphore_pool_(std::move(semaphore_pool)) {}
+
+EmulatedTimelineSemaphore::~EmulatedTimelineSemaphore() {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::dtor");
+  CHECK_OK(TryToAdvanceTimeline(UINT64_MAX).status());
+  absl::MutexLock lock(&mutex_);
+  CHECK(outstanding_semaphores_.empty())
+      << "Destroying an emulated timeline semaphore without first waiting on "
+         "outstanding signals";
+}
+
+StatusOr<uint64_t> EmulatedTimelineSemaphore::Query() {
+  RETURN_IF_ERROR(TryToAdvanceTimeline(UINT64_MAX).status());
+  uint64_t value = signaled_value_.load();
+  if (value == UINT64_MAX) {
+    absl::MutexLock lock(&mutex_);
+    return status_;
+  }
+  return value;
+}
+
+Status EmulatedTimelineSemaphore::Signal(uint64_t value) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::Signal");
+  auto signaled_value = signaled_value_.exchange(value);
+  // Make sure the previous signaled value is smaller than the new value.
+  CHECK(signaled_value < value)
+      << "Attempting to signal a timeline value out of order; trying " << value
+      << " but " << signaled_value << " already signaled";
+
+  // Inform the device to make progress given we have a new value signaled now.
+  RETURN_IF_ERROR(on_signal_(this));
+
+  return OkStatus();
+}
+
+Status EmulatedTimelineSemaphore::Wait(uint64_t value, absl::Time deadline) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::Wait");
+
+  VkFence fence = VK_NULL_HANDLE;
+  do {
+    IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::Wait#loop");
+    // First try to advance the timeline without blocking to see whether we've
+    // already reached the desired value.
+    ASSIGN_OR_RETURN(bool reached_desired_value, TryToAdvanceTimeline(value));
+    if (reached_desired_value) return OkStatus();
+
+    // We must wait now. Find the first emulated time point that has a value >=
+    // the desired value so we can wait on its associated signal fence to make
+    // sure the timeline is advanced to the desired value.
+    absl::MutexLock lock(&mutex_);
+    auto semaphore = outstanding_semaphores_.begin();
+    for (; semaphore != outstanding_semaphores_.end(); ++semaphore) {
+      if ((*semaphore)->value >= value) break;
+    }
+    if (semaphore != outstanding_semaphores_.end()) {
+      if (!(*semaphore)->signal_fence) {
+        return InternalErrorBuilder(IREE_LOC)
+               << "Timeline should have a signal fence for the first time "
+                  "point beyond the signaled value";
+      }
+      fence = (*semaphore)->signal_fence->value();
+      // Found; we can break the loop and proceed to waiting now.
+      break;
+    }
+    // TODO(antiagainst): figure out a better way instead of the busy loop here.
+  } while (absl::Now() < deadline);
+
+  if (fence == VK_NULL_HANDLE) {
+    return DeadlineExceededErrorBuilder(IREE_LOC)
+           << "Deadline reached when waiting timeline semaphore";
+  }
+
+  uint64_t timeout_nanos;
+  if (deadline == absl::InfiniteFuture()) {
+    timeout_nanos = UINT64_MAX;
+  } else if (deadline == absl::InfinitePast()) {
+    timeout_nanos = 0;
+  } else {
+    auto relative_nanos = absl::ToInt64Nanoseconds(deadline - absl::Now());
+    timeout_nanos = relative_nanos < 0 ? 0 : relative_nanos;
+  }
+
+  VK_RETURN_IF_ERROR(logical_device_->syms()->vkWaitForFences(
+      *logical_device_, /*fenceCount=*/1, &fence, /*waitAll=*/true,
+      timeout_nanos));
+
+  RETURN_IF_ERROR(TryToAdvanceTimeline(value).status());
+  return OkStatus();
+}
+
+void EmulatedTimelineSemaphore::Fail(Status status) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::Fail");
+  absl::MutexLock lock(&mutex_);
+  status_ = std::move(status);
+  signaled_value_.store(UINT64_MAX);
+}
+
+VkSemaphore EmulatedTimelineSemaphore::GetWaitSemaphore(
+    uint64_t value, const ref_ptr<TimePointFence>& wait_fence) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::GetWaitSemaphore");
+  absl::MutexLock lock(&mutex_);
+
+  VkSemaphore semaphore = VK_NULL_HANDLE;
+  for (TimePointSemaphore* point : outstanding_semaphores_) {
+    if (point->value > value && point->wait_fence) {
+      point->wait_fence = add_ref(wait_fence);
+      semaphore = point->semaphore;
+      break;
+    }
+  }
+
+  return semaphore;
+}
+
+Status EmulatedTimelineSemaphore::CancelWaitSemaphore(VkSemaphore semaphore) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::CancelWaitSemaphore");
+  absl::MutexLock lock(&mutex_);
+  for (TimePointSemaphore* point : outstanding_semaphores_) {
+    if (point->semaphore != semaphore) continue;
+
+    if (!point->wait_fence) {
+      return InvalidArgumentErrorBuilder(IREE_LOC)
+             << "Time point wasn't waited before";
+    }
+    point->wait_fence = nullptr;
+    return OkStatus();
+  }
+  return InvalidArgumentErrorBuilder(IREE_LOC)
+         << "No time point for the given semaphore";
+}
+
+StatusOr<VkSemaphore> EmulatedTimelineSemaphore::GetSignalSemaphore(
+    uint64_t value, const ref_ptr<TimePointFence>& signal_fence) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::GetSignalSemaphore");
+
+  if (signaled_value_.load() >= value) {
+    return FailedPreconditionErrorBuilder(IREE_LOC)
+           << "Timeline semaphore already signaled past " << value;
+  }
+
+  absl::MutexLock lock(&mutex_);
+
+  auto insertion_point = outstanding_semaphores_.begin();
+  while (insertion_point != outstanding_semaphores_.end()) {
+    if ((*insertion_point)->value > value) break;
+  }
+
+  ASSIGN_OR_RETURN(TimePointSemaphore * semaphore, semaphore_pool_->Acquire());
+  semaphore->value = value;
+  semaphore->signal_fence = add_ref(signal_fence);
+  if (semaphore->wait_fence) {
+    return InternalErrorBuilder(IREE_LOC)
+           << "Newly acquired time point semaphore should not have waiters";
+  }
+  outstanding_semaphores_.insert(insertion_point, semaphore);
+
+  return semaphore->semaphore;
+}
+
+StatusOr<bool> EmulatedTimelineSemaphore::TryToAdvanceTimeline(
+    uint64_t to_upper_value) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::TryToAdvanceTimeline");
+
+  // We hold the lock during the entire resolve process so that we can resolve
+  // to the furthest possible value.
+  absl::MutexLock lock(&mutex_);
+
+  uint64_t past_value = signaled_value_.load();
+
+  // Fast path for when already signaled past the desired value.
+  if (past_value >= to_upper_value) return true;
+
+  // The timeline has not signaled past the desired value and there is no
+  // binary semaphore pending on GPU yet: certainly the timeline cannot
+  // advance to the desired value.
+  if (outstanding_semaphores_.empty()) return false;
+
+  IntrusiveList<TimePointSemaphore> resolved_semaphores;
+
+  bool keep_resolving = true;
+  bool reached_desired_value = false;
+  while (keep_resolving && !outstanding_semaphores_.empty()) {
+    auto* semaphore = outstanding_semaphores_.front();
+
+    // If the current semaphore is for a value beyond our upper limit, then
+    // early exit so that we don't spend time dealing with signals we don't yet
+    // care about. This can prevent live lock where one thread is signaling
+    // fences as fast/faster than another thread can consume them.
+    if (semaphore->value > to_upper_value) {
+      keep_resolving = false;
+      reached_desired_value = true;
+      break;
+    }
+
+    // If the current semaphore is for a value not greater than the past
+    // signaled value, then we know it was signaled previously. But there might
+    // be a waiter on it on GPU.
+    if (semaphore->value <= past_value) {
+      if (semaphore->signal_fence) {
+        return InternalErrorBuilder(IREE_LOC)
+               << "Timeline should already signaled past this time point and "
+                  "cleared the signal fence";
+      }
+
+      // If ther is no waiters, we can recycle this semaphore now. If there
+      // exists one waiter, then query its status and recycle on success. We
+      // only handle success status here. Others will be handled when the fence
+      // is checked for other semaphores' signaling status for the same queue
+      // submission.
+      if (!semaphore->wait_fence ||
+          semaphore->wait_fence->GetStatus() == VK_SUCCESS) {
+        semaphore->signal_fence = nullptr;
+        semaphore->wait_fence = nullptr;
+        outstanding_semaphores_.erase(semaphore);
+        resolved_semaphores.push_back(semaphore);
+      }
+
+      continue;
+    }
+
+    // This semaphore represents a value gerater than the known previously
+    // signaled value. We don't know its status so we need to really query now.
+
+    if (!semaphore->signal_fence) {
+      return InternalErrorBuilder(IREE_LOC)
+             << "The status of this time point in the timeline should still be "
+                "pending with a singal fence";
+    }
+    VkResult signal_status = semaphore->signal_fence->GetStatus();
+
+    switch (signal_status) {
+      case VK_SUCCESS:
+        signaled_value_.store(semaphore->value);
+        semaphore->signal_fence = nullptr;
+        // If no waiters, we can recycle this semaphore now.
+        if (!semaphore->wait_fence) {
+          semaphore->signal_fence = nullptr;
+          semaphore->wait_fence = nullptr;
+          outstanding_semaphores_.erase(semaphore);
+          resolved_semaphores.push_back(semaphore);
+        }
+        break;
+      case VK_NOT_READY:
+        // The fence has not been signaled yet so this is the furthest time
+        // point we can go in this timeline.
+        keep_resolving = false;
+        break;
+      default:
+        // Fence indicates an error (device lost, out of memory, etc).
+        // Propagate this back to our status (and thus any waiters).
+        // Since we only take the first error we find we skip all remaining
+        // fences.
+        keep_resolving = false;
+        semaphore->signal_fence = nullptr;
+        status_ = VkResultToStatus(signal_status);
+        signaled_value_.store(UINT64_MAX);
+        break;
+    }
+  }
+
+  semaphore_pool_->ReleaseResolved(&resolved_semaphores);
+  if (!status_.ok()) {
+    on_failure_(this);
+    semaphore_pool_->ReleaseUnresolved(&outstanding_semaphores_);
+    return status_;
+  }
+
+  return reached_desired_value;
+}
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree

diff --git a/iree/hal/vulkan/emulated_timeline_semaphore.h b/iree/hal/vulkan/emulated_timeline_semaphore.h
new file mode 100644
index 0000000..cc13a09
--- /dev/null
+++ b/iree/hal/vulkan/emulated_timeline_semaphore.h

@@ -0,0 +1,223 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_VULKAN_ENUMLATED_TIMELINE_SEMAPHORE_H_
+#define IREE_HAL_VULKAN_ENUMLATED_TIMELINE_SEMAPHORE_H_
+
+#include <vulkan/vulkan.h>
+
+#include <atomic>
+#include <vector>
+
+#include "absl/base/thread_annotations.h"
+#include "absl/synchronization/mutex.h"
+#include "iree/base/intrusive_list.h"
+#include "iree/base/ref_ptr.h"
+#include "iree/base/status.h"
+#include "iree/hal/semaphore.h"
+#include "iree/hal/vulkan/handle_util.h"
+#include "iree/hal/vulkan/timepoint_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+// A timeline semaphore emulated via `VkFence`s and binary `VkSemaphore`s.
+//
+// Vulkan provides several explicit synchronization primitives: fences,
+// (binary/timeline) semaphores, events, pipeline barriers, and render passes.
+// See "6. Synchronization and Cache Control" of the Vulkan specification
+// for the details.
+//
+// Render passes are for graphics pipelines so IREE does not care about them.
+// Pipeline barriers synchronize control within a command buffer at a single
+// point. Fences, (binary/timeline) semaphores, and events are synchronization
+// primitives that have separate signal and wait operations. Events are more
+// fine-grained compared to fences and semaphores given that they can be
+// signaled or waited within a command buffer while fences and semaphores are
+// at queue submissions. Each of them have its usage requirements:
+//
+// * Fences must be signaled on GPU and waited on CPU. Fences must be reset
+//   before reuse.
+// * Binary semaphores must be signaled on GPU and waited on GPU. They do not
+//   support wait-before-signal submission order. More importantly, binary
+//   semaphore wait also unsignals the semaphore. So binary semaphore signals
+//   and waits should occur in discrete 1:1 pairs.
+// * Timeline semaphores can be signaled on CPU or GPU and waited on CPU or GPU.
+//   They support wait-before-signal submission order. Timeline semaphores do
+//   not need to be reset.
+//
+// It's clear that timeline semaphore is more flexible than fences and binary
+// semaphores: it unifies GPU and CPU synchronization with a single primitive.
+// But it's not always available: it requires the VK_KHR_timeline_semaphore
+// or Vulkan 1.2. When it's not available, it can be emulated via `VkFence`s
+// and binary `VkSemaphore`s. The emulation need to provide the functionality of
+// timeline semaphores and also not violate the usage requirements of `VkFence`s
+// and binary `VkSemaphore`s.
+//
+// The basic idea is to create a timeline object with time points to emulate the
+// timeline semaphore, which consists of a monotonically increasing 64-bit
+// integer value. Each time point represents a specific signaled/waited integer
+// value of the timeline semaphore; each time point can associate with binary
+// `VkSemaphore`s and/or `VkFence`s for emulating the synchronization.
+//
+// Concretely, for each of the possible signal -> wait scenarios timeline
+// semaphore supports:
+//
+// ### GPU -> GPU (via `vkQueueSubmit`)
+//
+// Each `vkQueueSubmit` can attach a `VkTimelineSemaphoreSubmitInfo` to describe
+// the timeline semaphore values signaled and waited. Each of the signaled value
+// will be a time point and emulated by a binary `VkSemaphore`. We submit the
+// binary `VkSemahpore`s to the GPU under the hood. For the waited values, the
+// situation is more complicated because of the differences between binary and
+// timeline semaphores:
+//
+// * Binary semaphore signal-wait relationship is strictly 1:1, unlike timeline
+//   semaphore where we can have 1:N cases. This means for a specific binary
+//   `VkSemaphore` used to emulate a signaled time point, we can have at most
+//   one subsequent `vkQueueSubmit` waits on it. We need other mechanisms for
+//   additional waits. A simple way is to involve the CPU and don't sumbit
+//   the additional work to queue until the desired value is already signaled
+//   past. This requires `VkFence`s for letting the CPU know the status of
+//   GPU progress, but `VkFence` is needed anyway because of GPU -> CPU
+//   synchronization.
+// * Binary semaphores does not support wait-before-signal submission order.
+//   This means we need to put the submission into a self-managed queue if the
+//   binary semaphores used to emulate the time points waited by the submission
+//   are not submitted to GPU yet.
+//
+// ### GPU -> CPU (via `vkWaitSemaphores`)
+//
+// Without timeline semaphore, we need to use fences to let CPU wait on GPU
+// progress. So this direction can be emulated by `vkWaitFences`. It means we
+// need to associate a `VkFence` with the given waited timeline semaphores.
+// Because we don't know whether a particular `vkQueueSubmit` with timeline
+// semaphores will be later waited on by CPU beforehand, we need to bundle each
+// of them with a `VkFence` just in case they will be waited on later.
+//
+// ### CPU -> GPU (via `vkSignalSemaphore`)
+//
+// This direction can be handled by bumping the signaled timeline value and
+// scan the self-managed queue to submit more work to GPU if possible.
+//
+// ### CPU -> CPU (via `vkWaitSemaphores`)
+//
+// This is similar to CPU -> GPU direction; we just need to enable other threads
+// on CPU side and let them progress.
+//
+// The implementation is inspired by the Vulkan-ExtensionLayer project:
+// https://github.com/KhronosGroup/Vulkan-ExtensionLayer. We don't handle all
+// the aspects of the full spec though given that IREE only uses a subset of
+// synchronization primitives. So this should not be treated as a full
+// emulation of the Vulkan spec and thus does not substitute
+// Vulkan-ExtensionLayer.
+class EmulatedTimelineSemaphore final : public Semaphore {
+ public:
+  // Creates a timeline semaphore with the given |initial_value|.
+  static StatusOr<ref_ptr<Semaphore>> Create(
+      ref_ptr<VkDeviceHandle> logical_device,
+      std::function<Status(Semaphore*)> on_signal,
+      std::function<void(Semaphore*)> on_failure,
+      ref_ptr<TimePointSemaphorePool> semaphore_pool, uint64_t initial_value);
+
+  EmulatedTimelineSemaphore(ref_ptr<VkDeviceHandle> logical_device,
+                            std::function<Status(Semaphore*)> on_signal,
+                            std::function<void(Semaphore*)> on_failure,
+                            ref_ptr<TimePointSemaphorePool> semaphore_pool,
+                            uint64_t initialValue);
+
+  ~EmulatedTimelineSemaphore() override;
+
+  StatusOr<uint64_t> Query() override;
+
+  Status Signal(uint64_t value) override;
+
+  Status Wait(uint64_t value, absl::Time deadline) override;
+
+  void Fail(Status status) override;
+
+  // Gets a binary semaphore for waiting on the timeline to advance to the given
+  // |value|. The semaphore returned won't be waited by anyone else. Returns
+  // VK_NULL_HANDLE if no available semaphores for the given |value|.
+  // |wait_fence| is the fence associated with the queue submission that waiting
+  // on this semaphore.
+  VkSemaphore GetWaitSemaphore(uint64_t value,
+                               const ref_ptr<TimePointFence>& wait_fence);
+
+  // Cancels the waiting attempt on the given binary |semaphore|. This allows
+  // the |semaphore| to be waited by others.
+  Status CancelWaitSemaphore(VkSemaphore semaphore);
+
+  // Gets a binary semaphore for signaling the timeline to the given |value|.
+  // |value| must be smaller than the current timeline value. |signal_fence| is
+  // the fence associated with the queue submission that signals this semaphore.
+  StatusOr<VkSemaphore> GetSignalSemaphore(
+      uint64_t value, const ref_ptr<TimePointFence>& signal_fence);
+
+ private:
+  // Tries to advance the timeline to the given |to_upper_value| without
+  // blocking and returns whether the |to_upper_value| is reached.
+  StatusOr<bool> TryToAdvanceTimeline(uint64_t to_upper_value)
+      ABSL_LOCKS_EXCLUDED(mutex_);
+
+  std::atomic<uint64_t> signaled_value_;
+
+  ref_ptr<VkDeviceHandle> logical_device_;
+
+  // Callback to inform that this timeline semaphore has signaled a new value.
+  std::function<Status(Semaphore*)> on_signal_;
+
+  // Callback to inform that this timeline semaphore has encountered a failure.
+  std::function<void(Semaphore*)> on_failure_;
+
+  ref_ptr<TimePointSemaphorePool> semaphore_pool_;
+
+  mutable absl::Mutex mutex_;
+
+  // A list of outstanding semaphores used to emulate time points.
+  //
+  // The life time of each semaphore is in one of the following state:
+  //
+  // * Unused state: value = UINT64_MAX, signal/wait fence = nullptr. This is
+  //   the state of the semaphore when it's initially acquired from the pool and
+  //   not put in the queue for emulating a time point yet.
+  // * Pending state: signaled value < value < UINT64_MAX, signal fence =
+  //   <some-fence>, wait fence == nullptr. This is the state of the semaphore
+  //   when it's put into the GPU queue for emulating a time point.
+  // * Pending and waiting state: signaled value < value < UINT64_MAX, signal
+  //   fence = <some-fence>, wait fence == <some-fence>. This is the state of
+  //   the semaphore when it's put into the GPU queue for emulating a time
+  //   point and there is another queue submission waiting on it in GPU.
+  // * Signaled and not ever waited state: value <= signaled value, singal/wait
+  //   fence = nullptr. This is the state of the semaphore when we know it's
+  //   already signaled on GPU and there is no waiters for it.
+  // * Signaled and waiting state: value <= signaled value, signal fence =
+  //   nullptr, wait fence = <some-fence>. This is the state of the semaphore
+  //   when we know it's already signaled on GPU and there is still one queue
+  //   submission on GPU is waiting for it.
+  IntrusiveList<TimePointSemaphore> outstanding_semaphores_
+      ABSL_GUARDED_BY(mutex_);
+
+  // NOTE: We only need to access this status (and thus take the lock) when we
+  // want to either signal failure or query the status in the case of the
+  // semaphore being set to UINT64_MAX.
+  Status status_ ABSL_GUARDED_BY(mutex_);
+};
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_VULKAN_ENUMLATED_TIMELINE_SEMAPHORE_H_

diff --git a/iree/hal/vulkan/serializing_command_queue.cc b/iree/hal/vulkan/serializing_command_queue.cc
new file mode 100644
index 0000000..9d6d24c
--- /dev/null
+++ b/iree/hal/vulkan/serializing_command_queue.cc

@@ -0,0 +1,355 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/hal/vulkan/serializing_command_queue.h"
+
+#include <memory>
+
+#include "absl/time/clock.h"
+#include "absl/types/span.h"
+#include "iree/base/memory.h"
+#include "iree/base/source_location.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/command_buffer.h"
+#include "iree/hal/command_queue.h"
+#include "iree/hal/semaphore.h"
+#include "iree/hal/vulkan/direct_command_buffer.h"
+#include "iree/hal/vulkan/emulated_timeline_semaphore.h"
+#include "iree/hal/vulkan/status_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+namespace {
+
+// Tries to prepare all necessary binary `VKSemaphore`s for emulating the time
+// points as specified in the given submission |batch| and returns true if
+// possible so that the |batch| is ready to be submitted to GPU.
+// |wait_semaphores| and |signal_semaphores| will be filled with the binary
+// `VkSemaphores` on success. |fence| is the fence associated with the
+// submission |batch|.
+StatusOr<bool> TryToPrepareSemaphores(
+    const SubmissionBatch& batch, const ref_ptr<TimePointFence>& fence,
+    absl::InlinedVector<VkSemaphore, 4>* wait_semaphores,
+    absl::InlinedVector<VkSemaphore, 4>* signal_semaphores) {
+  IREE_TRACE_SCOPE0("TryToPrepareSemaphores");
+
+  wait_semaphores->clear();
+  for (const auto& timeline_semaphore : batch.wait_semaphores) {
+    // Query first to progress this timeline semaphore to the furthest.
+    ASSIGN_OR_RETURN(auto signaled_value,
+                     timeline_semaphore.semaphore->Query());
+
+    // If it's already signaled to a value greater than we require here,
+    // we can just ignore this semaphore now.
+    if (signaled_value >= timeline_semaphore.value) continue;
+
+    // SerializingCommandQueue only works with EmulatedTimelineSemaphore.
+    auto* emulated_semaphore =
+        static_cast<EmulatedTimelineSemaphore*>(timeline_semaphore.semaphore);
+
+    // Otherwise try to get a binary semaphore for this time point so that
+    // we can wait on.
+    VkSemaphore binary_semaphore =
+        emulated_semaphore->GetWaitSemaphore(timeline_semaphore.value, fence);
+
+    if (binary_semaphore == VK_NULL_HANDLE) {
+      // We cannot wait on this time point yet: there are no previous semaphores
+      // submitted to the GPU  that can signal a value greater than what's
+      // desired here.
+
+      // Cancel the wait so others may make progress.
+      for (VkSemaphore semaphore : *wait_semaphores) {
+        RETURN_IF_ERROR(emulated_semaphore->CancelWaitSemaphore(semaphore));
+      }
+
+      // This batch cannot be submitted to GPU yet.
+      return false;
+    }
+
+    wait_semaphores->push_back(binary_semaphore);
+  }
+
+  // We've collected all necessary binary semaphores for each timeline we need
+  // to wait on. Now prepare binary semaphores for signaling.
+  signal_semaphores->clear();
+  for (const auto& timeline_semaphore : batch.signal_semaphores) {
+    // SerializingCommandQueue only works with EmulatedTimelineSemaphore.
+    auto* emulated_semaphore =
+        static_cast<EmulatedTimelineSemaphore*>(timeline_semaphore.semaphore);
+
+    ASSIGN_OR_RETURN(auto binary_semaphore,
+                     emulated_semaphore->GetSignalSemaphore(
+                         timeline_semaphore.value, fence));
+    signal_semaphores->push_back(binary_semaphore);
+  }
+
+  // Good to submit!
+  return true;
+}
+
+// Prepares `VkSubmitInfo` to submit the given list of |command_buffers| that
+// waiting on |wait_semaphores| and signalling |signal_semaphores|. Necessary
+// structures are allocated from |arena| and the result `VkSubmitInfo` is
+// written to |submit_info|.
+void PrepareSubmitInfo(
+    const absl::InlinedVector<VkSemaphore, 4>& wait_semaphores,
+    absl::Span<CommandBuffer* const> command_buffers,
+    const absl::InlinedVector<VkSemaphore, 4>& signal_semaphores,
+    VkSubmitInfo* submit_info, Arena* arena) {
+  IREE_TRACE_SCOPE0("PrepareSubmitInfo");
+
+  // TODO(benvanik): see if we can go to finer-grained stages.
+  // For example, if this was just queue ownership transfers then we can use
+  // the pseudo-stage of VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT.
+  VkPipelineStageFlags dst_stage_mask =
+      VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
+
+  auto wait_semaphore_handles =
+      arena->AllocateSpan<VkSemaphore>(wait_semaphores.size());
+  auto wait_dst_stage_masks =
+      arena->AllocateSpan<VkPipelineStageFlags>(wait_semaphores.size());
+  for (int i = 0, e = wait_semaphores.size(); i < e; ++i) {
+    wait_semaphore_handles[i] = wait_semaphores[i];
+    wait_dst_stage_masks[i] = dst_stage_mask;
+  }
+
+  auto signal_semaphore_handles =
+      arena->AllocateSpan<VkSemaphore>(signal_semaphores.size());
+  for (int i = 0, e = signal_semaphores.size(); i < e; ++i) {
+    signal_semaphore_handles[i] = signal_semaphores[i];
+  }
+
+  auto command_buffer_handles =
+      arena->AllocateSpan<VkCommandBuffer>(command_buffers.size());
+  for (int i = 0, e = command_buffers.size(); i < e; ++i) {
+    const auto& command_buffer = command_buffers[i];
+    auto* direct_command_buffer =
+        static_cast<DirectCommandBuffer*>(command_buffer->impl());
+    command_buffer_handles[i] = direct_command_buffer->handle();
+  }
+
+  submit_info->sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+  submit_info->pNext = nullptr;
+  submit_info->waitSemaphoreCount = wait_semaphore_handles.size();
+  submit_info->pWaitSemaphores = wait_semaphore_handles.data();
+  submit_info->pWaitDstStageMask = wait_dst_stage_masks.data();
+  submit_info->commandBufferCount = command_buffer_handles.size();
+  submit_info->pCommandBuffers = command_buffer_handles.data();
+  submit_info->signalSemaphoreCount = signal_semaphore_handles.size();
+  submit_info->pSignalSemaphores = signal_semaphore_handles.data();
+}
+
+}  // namespace
+
+SerializingCommandQueue::SerializingCommandQueue(
+    std::string name, CommandCategoryBitfield supported_categories,
+    const ref_ptr<VkDeviceHandle>& logical_device,
+    const ref_ptr<TimePointFencePool>& fence_pool, VkQueue queue)
+    : CommandQueue(std::move(name), supported_categories),
+      logical_device_(add_ref(logical_device)),
+      fence_pool_(add_ref(fence_pool)),
+      queue_(queue) {}
+
+SerializingCommandQueue::~SerializingCommandQueue() {
+  IREE_TRACE_SCOPE0("SerializingCommandQueue::dtor");
+  absl::MutexLock lock(&mutex_);
+  syms()->vkQueueWaitIdle(queue_);
+}
+
+Status SerializingCommandQueue::Submit(
+    absl::Span<const SubmissionBatch> batches) {
+  IREE_TRACE_SCOPE0("SerializingCommandQueue::Submit");
+
+  absl::MutexLock lock(&mutex_);
+  for (const auto& batch : batches) {
+    // Grab a fence for this submission first. This will be used to check the
+    // progress of emulated timeline semaphores later.
+    ASSIGN_OR_RETURN(auto fence, fence_pool_->Acquire());
+    deferred_submissions_.push_back(
+        std::make_unique<FencedSubmission>(batch, std::move(fence)));
+  }
+
+  return ProcessDeferredSubmissions().status();
+}
+
+StatusOr<bool> SerializingCommandQueue::ProcessDeferredSubmissions() {
+  IREE_TRACE_SCOPE0("SerializingCommandQueue::ProcessDeferredSubmissions");
+
+  // Prepare `VkSubmitInfo`s for all submissions we are able to submit.
+
+  // Note that we must keep all arrays referenced alive until submission
+  // completes and since there are a bunch of them we use an arena.
+  Arena arena(4 * 1024);
+
+  absl::InlinedVector<VkSubmitInfo, 4> submit_infos;
+  absl::InlinedVector<VkFence, 4> submit_fences;
+
+  absl::InlinedVector<VkSemaphore, 4> wait_semaphores;
+  absl::InlinedVector<VkSemaphore, 4> signal_semaphores;
+
+  // A list of submissions that still needs to be deferred.
+  IntrusiveList<std::unique_ptr<FencedSubmission>> remaining_submissions;
+
+  while (!deferred_submissions_.empty()) {
+    wait_semaphores.clear();
+    signal_semaphores.clear();
+
+    auto submission = deferred_submissions_.take(deferred_submissions_.front());
+    const SubmissionBatch& batch = submission->batch;
+    ref_ptr<TimePointFence> fence(std::move(submission->fence));
+
+    ASSIGN_OR_RETURN(bool ready_to_submit,
+                     TryToPrepareSemaphores(batch, fence, &wait_semaphores,
+                                            &signal_semaphores));
+
+    if (ready_to_submit) {
+      submit_infos.emplace_back();
+      PrepareSubmitInfo(wait_semaphores, batch.command_buffers,
+                        signal_semaphores, &submit_infos.back(), &arena);
+      submit_fences.push_back(fence->value());
+      pending_fences_.emplace_back(std::move(fence));
+    } else {
+      // We need to defer the submission until later.
+      remaining_submissions.push_back(std::move(submission));
+    }
+  }
+
+  if (submit_infos.empty()) return false;
+
+  auto infos = arena.AllocateSpan<VkSubmitInfo>(submit_infos.size());
+  for (int i = 0, e = submit_infos.size(); i < e; ++i) {
+    infos[i] = submit_infos[i];
+  }
+
+  // Note: We might be able to batch the submission but it involves non-trivial
+  // fence handling. We can handle that if really needed.
+  for (int i = 0, e = submit_infos.size(); i < e; ++i) {
+    VK_RETURN_IF_ERROR(syms()->vkQueueSubmit(
+        queue_, /*submitCount=*/1, &submit_infos[i], submit_fences[i]));
+  }
+
+  while (!remaining_submissions.empty()) {
+    deferred_submissions_.push_back(
+        remaining_submissions.take(remaining_submissions.front()));
+  }
+
+  return true;
+}
+
+Status SerializingCommandQueue::WaitIdle(absl::Time deadline) {
+  absl::MutexLock lock(&mutex_);
+
+  if (deadline == absl::InfiniteFuture()) {
+    IREE_TRACE_SCOPE0("SerializingCommandQueue::WaitIdle#vkQueueWaitIdle");
+    // Fast path for using vkQueueWaitIdle, which is usually cheaper (as it
+    // requires fewer calls into the driver).
+
+    // Complete all pending work on the queue.
+    VK_RETURN_IF_ERROR(syms()->vkQueueWaitIdle(queue_));
+    pending_fences_.clear();
+
+    // Submit and complete all deferred work.
+    while (!deferred_submissions_.empty()) {
+      ASSIGN_OR_RETURN(bool work_submitted, ProcessDeferredSubmissions());
+      if (work_submitted) {
+        VK_RETURN_IF_ERROR(syms()->vkQueueWaitIdle(queue_));
+        pending_fences_.clear();
+      }
+    }
+
+    return OkStatus();
+  }
+
+  IREE_TRACE_SCOPE0("SerializingCommandQueue::WaitIdle#Fence");
+
+  // Keep trying to submit more workload to the GPU until reaching the deadline.
+  do {
+    RETURN_IF_ERROR(ProcessDeferredSubmissions().status());
+
+    uint64_t timeout_nanos;
+    if (deadline == absl::InfinitePast()) {
+      // Do not wait.
+      timeout_nanos = 0;
+    } else {
+      // Convert to relative time in nanoseconds.
+      // The implementation may not wait with this granularity (like, by
+      // 10000x).
+      absl::Time now = absl::Now();
+      if (deadline < now) {
+        return DeadlineExceededErrorBuilder(IREE_LOC)
+               << "Deadline exceeded waiting for idle";
+      }
+      timeout_nanos =
+          static_cast<uint64_t>(absl::ToInt64Nanoseconds(deadline - now));
+    }
+
+    if (pending_fences_.empty()) continue;
+
+    std::vector<VkFence> fences;
+    fences.reserve(pending_fences_.size());
+    for (const auto& fence : pending_fences_) fences.push_back(fence->value());
+
+    VkResult result =
+        syms()->vkWaitForFences(*logical_device_, fences.size(), fences.data(),
+                                /*waitAll=*/VK_TRUE, timeout_nanos);
+
+    switch (result) {
+      case VK_SUCCESS:
+        pending_fences_.clear();
+        break;
+      case VK_TIMEOUT:
+        return DeadlineExceededErrorBuilder(IREE_LOC)
+               << "Deadline exceeded waiting for idle";
+      default:
+        return VkResultToStatus(result);
+    }
+    // As long as there is submitted or deferred work still pending.
+  } while (!pending_fences_.empty() || !deferred_submissions_.empty());
+
+  return OkStatus();
+}
+
+Status SerializingCommandQueue::AdvanceQueueSubmission() {
+  absl::MutexLock lock(&mutex_);
+  // The returned value just indicates whether there were newly ready
+  // submissions gotten submitted to the GPU. Other callers might be
+  // interested in that information but for this API we just want to advance
+  // queue submisison if possible. So we ignore it here.
+  ASSIGN_OR_RETURN(std::ignore, ProcessDeferredSubmissions());
+  return OkStatus();
+}
+
+void SerializingCommandQueue::AbortQueueSubmission() {
+  absl::MutexLock lock(&mutex_);
+
+  // We have fences in deferred_submissions_ but they are not submitted to GPU
+  // yet so we don't need to reset.
+  deferred_submissions_.clear();
+
+  std::vector<VkFence> fences;
+  fences.reserve(pending_fences_.size());
+  for (const auto& fence : pending_fences_) fences.push_back(fence->value());
+
+  syms()->vkWaitForFences(*logical_device_, fences.size(), fences.data(),
+                          /*waitAll=*/VK_TRUE, /*timeout=*/UINT64_MAX);
+  // Clear the list. Fences will be automatically returned back to the queue
+  // after refcount reaches 0.
+  pending_fences_.clear();
+}
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree

diff --git a/iree/hal/vulkan/serializing_command_queue.h b/iree/hal/vulkan/serializing_command_queue.h
new file mode 100644
index 0000000..e38643b
--- /dev/null
+++ b/iree/hal/vulkan/serializing_command_queue.h

@@ -0,0 +1,111 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_VULKAN_SERIALIZING_COMMAND_QUEUE_H_
+#define IREE_HAL_VULKAN_SERIALIZING_COMMAND_QUEUE_H_
+
+#include <vulkan/vulkan.h>
+
+#include <memory>
+#include <string>
+
+#include "absl/base/thread_annotations.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/time/time.h"
+#include "iree/base/intrusive_list.h"
+#include "iree/base/ref_ptr.h"
+#include "iree/base/status.h"
+#include "iree/hal/command_buffer.h"
+#include "iree/hal/command_queue.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/handle_util.h"
+#include "iree/hal/vulkan/timepoint_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+// A command queue that potentially defers and serializes command buffer
+// submission to the GPU.
+//
+// This command queue is designed to be used together with emulated timeline
+// semaphores. Timeline semaphores can follow wait-before-signal submission
+// order but binary `VkSemaphore` cannot. So when emulating timeline semaphores
+// with binary `VkSemaphore`s and `VkFence`s, we need to make sure no
+// wait-before-signal submission order occur for binary `VkSemaphore`s. The way
+// to enforce that is to defer the submission until we can be certain that the
+// `VkSemaphore`s emulating time points in the timeline are all *submitted* to
+// the GPU.
+class SerializingCommandQueue final : public CommandQueue {
+ public:
+  SerializingCommandQueue(std::string name,
+                          CommandCategoryBitfield supported_categories,
+                          const ref_ptr<VkDeviceHandle>& logical_device,
+                          const ref_ptr<TimePointFencePool>& fence_pool,
+                          VkQueue queue);
+  ~SerializingCommandQueue() override;
+
+  const ref_ptr<DynamicSymbols>& syms() const {
+    return logical_device_->syms();
+  }
+
+  Status Submit(absl::Span<const SubmissionBatch> batches) override;
+
+  Status WaitIdle(absl::Time deadline) override;
+
+  // Releases all deferred submissions ready to submit to the GPU.
+  Status AdvanceQueueSubmission();
+
+  // Aborts all deferred submissions and waits for submitted work to complete.
+  void AbortQueueSubmission();
+
+ private:
+  // A submission batch together with the fence to singal its status.
+  struct FencedSubmission : IntrusiveLinkBase<void> {
+    SubmissionBatch batch;
+    ref_ptr<TimePointFence> fence;
+
+    FencedSubmission(const SubmissionBatch& batch,
+                     ref_ptr<TimePointFence> fence)
+        : batch(batch), fence(std::move(fence)) {}
+  };
+
+  // Processes deferred submissions in this queue and returns whether there are
+  // new workload submitted to the GPU if no errors happen.
+  StatusOr<bool> ProcessDeferredSubmissions()
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  ref_ptr<VkDeviceHandle> logical_device_;
+
+  ref_ptr<TimePointFencePool> fence_pool_;
+
+  mutable absl::Mutex mutex_;
+
+  // A list of fences that are submitted to GPU.
+  absl::InlinedVector<ref_ptr<TimePointFence>, 4> pending_fences_
+      ABSL_GUARDED_BY(mutex_);
+  // A list of deferred submissions that haven't been submitted to GPU.
+  IntrusiveList<std::unique_ptr<FencedSubmission>> deferred_submissions_
+      ABSL_GUARDED_BY(mutex_);
+
+  // VkQueue needs to be externally synchronized.
+  VkQueue queue_ ABSL_GUARDED_BY(mutex_);
+};
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_VULKAN_SERIALIZING_COMMAND_QUEUE_H_

diff --git a/iree/hal/vulkan/timepoint_util.cc b/iree/hal/vulkan/timepoint_util.cc
new file mode 100644
index 0000000..c212856
--- /dev/null
+++ b/iree/hal/vulkan/timepoint_util.cc

@@ -0,0 +1,226 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/hal/vulkan/timepoint_util.h"
+
+#include <memory>
+
+#include "absl/synchronization/mutex.h"
+#include "absl/time/time.h"
+#include "absl/utility/utility.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/status_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+// static
+void TimePointFence::Delete(TimePointFence* ptr) {
+  ptr->pool()->ReleaseResolved(ptr);
+}
+
+VkResult TimePointFence::GetStatus() {
+  absl::MutexLock lock(&status_mutex_);
+  if (status_ == VK_NOT_READY) {
+    const auto& device = pool()->logical_device();
+    status_ = device->syms()->vkGetFenceStatus(*device, fence_);
+  }
+  return status_;
+}
+
+// static
+StatusOr<ref_ptr<TimePointFencePool>> TimePointFencePool::Create(
+    ref_ptr<VkDeviceHandle> logical_device) {
+  IREE_TRACE_SCOPE0("TimePointFencePool::Create");
+  ref_ptr<TimePointFencePool> pool(
+      new TimePointFencePool(std::move(logical_device)));
+  RETURN_IF_ERROR(pool->PreallocateFences());
+  return pool;
+}
+
+TimePointFencePool::~TimePointFencePool() {
+  IREE_TRACE_SCOPE0("TimePointFencePool::dtor");
+
+  absl::MutexLock lock(&mutex_);
+  int free_count = 0;
+  for (auto* fence : free_fences_) {
+    syms()->vkDestroyFence(*logical_device_, fence->value(),
+                           logical_device_->allocator());
+    ++free_count;
+  }
+  DCHECK_EQ(free_count, kMaxInFlightFenceCount);
+  free_fences_.clear();
+}
+
+StatusOr<ref_ptr<TimePointFence>> TimePointFencePool::Acquire() {
+  IREE_TRACE_SCOPE0("TimePointFencePool::Acquire");
+
+  absl::MutexLock lock(&mutex_);
+  if (free_fences_.empty()) {
+    return ResourceExhaustedErrorBuilder(IREE_LOC)
+           << "Fence pool out of free fences";
+  }
+
+  auto* fence = free_fences_.front();
+  free_fences_.pop_front();
+  return add_ref(fence);
+}
+
+void TimePointFencePool::ReleaseResolved(TimePointFence* fence) {
+  IREE_TRACE_SCOPE0("TimePointFencePool::ReleaseResolved");
+  VkFence f = fence->value();
+  syms()->vkResetFences(*logical_device_, 1, &f);
+  absl::MutexLock lock(&mutex_);
+  free_fences_.push_back(fence);
+}
+
+TimePointFencePool::TimePointFencePool(ref_ptr<VkDeviceHandle> logical_device)
+    : logical_device_(std::move(logical_device)) {}
+
+const ref_ptr<DynamicSymbols>& TimePointFencePool::syms() const {
+  return logical_device_->syms();
+}
+
+Status TimePointFencePool::PreallocateFences() {
+  IREE_TRACE_SCOPE0("TimePointFencePool::PreallocateFences");
+
+  VkFenceCreateInfo create_info;
+  create_info.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
+  create_info.pNext = nullptr;
+  create_info.flags = 0;
+
+  std::array<std::unique_ptr<TimePointFence>, kMaxInFlightFenceCount> fences;
+  {
+    absl::MutexLock lock(&mutex_);
+    for (int i = 0; i < fences.size(); ++i) {
+      VkFence fence = VK_NULL_HANDLE;
+      VK_RETURN_IF_ERROR(syms()->vkCreateFence(*logical_device_, &create_info,
+                                               logical_device_->allocator(),
+                                               &fence));
+      fences[i].reset(new TimePointFence(this, fence));
+    }
+  }
+
+  for (int i = 0; i < fences.size(); ++i) {
+    // The `TimePointFence`s was created with an initial ref-count of one.
+    // Decrease explicitly to zero so that later we can rely on the ref-count
+    // reaching zero to auto-release the `TimePointFence` back to the free
+    // list. As a nice side effect, this will also initialize the free list
+    // with all newly created fences.
+    // TODO: Might want to avoid acquiring and releasing the mutex for each
+    // fence.
+    fences[i].release()->ReleaseReference();
+  }
+
+  return OkStatus();
+}
+
+// static
+StatusOr<ref_ptr<TimePointSemaphorePool>> TimePointSemaphorePool::Create(
+    ref_ptr<VkDeviceHandle> logical_device) {
+  IREE_TRACE_SCOPE0("TimePointSemaphorePool::Create");
+  ref_ptr<TimePointSemaphorePool> pool(
+      new TimePointSemaphorePool(std::move(logical_device)));
+  RETURN_IF_ERROR(pool->PreallocateSemaphores());
+  return pool;
+}
+
+TimePointSemaphorePool::~TimePointSemaphorePool() {
+  IREE_TRACE_SCOPE0("TimePointSemaphorePool::dtor");
+
+  absl::MutexLock lock(&mutex_);
+
+  DCHECK_EQ(free_semaphores_.size(), kMaxInFlightSemaphoreCount);
+  free_semaphores_.clear();
+
+  for (auto& semaphore : storage_) {
+    syms()->vkDestroySemaphore(*logical_device_, semaphore.semaphore,
+                               logical_device_->allocator());
+  }
+}
+
+StatusOr<TimePointSemaphore*> TimePointSemaphorePool::Acquire() {
+  IREE_TRACE_SCOPE0("TimePointSemaphorePool::Acquire");
+
+  absl::MutexLock lock(&mutex_);
+  if (free_semaphores_.empty()) {
+    return ResourceExhaustedErrorBuilder(IREE_LOC)
+           << "Semaphore pool out of free semaphores";
+  }
+
+  auto* semaphore = free_semaphores_.front();
+  free_semaphores_.pop_front();
+  return semaphore;
+}
+
+void TimePointSemaphorePool::ReleaseResolved(
+    IntrusiveList<TimePointSemaphore>* semaphores) {
+  IREE_TRACE_SCOPE0("TimePointSemaphorePool::ReleaseResolved");
+
+  for (auto* semaphore : *semaphores) {
+    DCHECK(!semaphore->signal_fence && !semaphore->wait_fence);
+    semaphore->value = UINT64_MAX;
+  }
+
+  absl::MutexLock lock(&mutex_);
+  free_semaphores_.merge_from(semaphores);
+}
+
+void TimePointSemaphorePool::ReleaseUnresolved(
+    IntrusiveList<TimePointSemaphore>* semaphores) {
+  IREE_TRACE_SCOPE0("TimePointSemaphorePool::ReleaseUnresolved");
+
+  for (auto* semaphore : *semaphores) {
+    semaphore->signal_fence = nullptr;
+    semaphore->wait_fence = nullptr;
+    semaphore->value = UINT64_MAX;
+  }
+
+  absl::MutexLock lock(&mutex_);
+  free_semaphores_.merge_from(semaphores);
+}
+
+TimePointSemaphorePool::TimePointSemaphorePool(
+    ref_ptr<VkDeviceHandle> logical_device)
+    : logical_device_(std::move(logical_device)) {}
+
+const ref_ptr<DynamicSymbols>& TimePointSemaphorePool::syms() const {
+  return logical_device_->syms();
+}
+
+Status TimePointSemaphorePool::PreallocateSemaphores() {
+  IREE_TRACE_SCOPE0("TimePointSemaphorePool::PreallocateSemaphores");
+
+  VkSemaphoreCreateInfo create_info;
+  create_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+  create_info.pNext = nullptr;
+  create_info.flags = 0;
+
+  absl::MutexLock lock(&mutex_);
+  for (int i = 0; i < kMaxInFlightSemaphoreCount; ++i) {
+    auto* semaphore = &storage_[i];
+    VK_RETURN_IF_ERROR(syms()->vkCreateSemaphore(*logical_device_, &create_info,
+                                                 logical_device_->allocator(),
+                                                 &semaphore->semaphore));
+    free_semaphores_.push_back(semaphore);
+  }
+
+  return OkStatus();
+}
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree

diff --git a/iree/hal/vulkan/timepoint_util.h b/iree/hal/vulkan/timepoint_util.h
new file mode 100644
index 0000000..e2cb7df
--- /dev/null
+++ b/iree/hal/vulkan/timepoint_util.h

@@ -0,0 +1,210 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_VULKAN_TIMEPOINT_UTIL_H_
+#define IREE_HAL_VULKAN_TIMEPOINT_UTIL_H_
+
+#include <vulkan/vulkan.h>
+
+#include <atomic>
+#include <vector>
+
+#include "absl/base/thread_annotations.h"
+#include "absl/synchronization/mutex.h"
+#include "iree/base/intrusive_list.h"
+#include "iree/base/ref_ptr.h"
+#include "iree/base/status.h"
+#include "iree/hal/vulkan/handle_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+class TimePointFencePool;
+class TimePointSemaphorePool;
+
+// A fence used for tracking progress of timeline semaphores.
+//
+// Each queue submission gets a new `VkFence` associated with it so that we can
+// later query the `VkFence` on CPU to know what time points were signaled for
+// timeline semaphores.
+//
+// Ref-counting allows the fence to be associated with multiple time points from
+// different timelines without worrying about ownership complexity.
+//
+// This is expected to used together with `TimePointFencePool` and must be
+// externally synchronized via `TimePointFencePool`'s mutex.
+class TimePointFence final : public RefObject<TimePointFence>,
+                             public IntrusiveLinkBase<void> {
+ public:
+  TimePointFence(TimePointFencePool* pool, VkFence fence)
+      : pool_(pool), fence_(fence) {}
+
+  TimePointFence(TimePointFence&& that) = delete;
+  TimePointFence& operator=(TimePointFence&&) = delete;
+
+  TimePointFence(const TimePointFence&) = delete;
+  TimePointFence& operator=(const TimePointFence&) = delete;
+
+  // Returns this fence to the pool on destruction.
+  static void Delete(TimePointFence* ptr);
+
+  VkFence value() const noexcept { return fence_; }
+  operator VkFence() const noexcept { return fence_; }
+
+  // Gets the status of this fence object. This might issue an Vulkan API call
+  // under the hood.
+  VkResult GetStatus();
+
+  // Returns the pool from which this fence comes.
+  TimePointFencePool* pool() const { return pool_; }
+
+ private:
+  // The pool from which this fence comes.
+  TimePointFencePool* pool_;
+
+  // Allocated fence that associated with a bunch of time point(s) of
+  // timeline(s). This is passed to queue submission so that we can track the
+  // timeline(s) progress on CPU and schedule work.
+  VkFence fence_;
+
+  // The fence's status.
+  absl::Mutex status_mutex_;
+  VkResult status_ ABSL_GUARDED_BY(status_mutex_) = VK_NOT_READY;
+};
+
+// A semaphore used for emulating a specific time point of timeline semaphores.
+//
+// Each signaled time point in a timeline semaphore is emulated with a new
+// binary `VkSemaphore` associated with queue submission. These time point
+// semaphores are stored in `EmulatedTimelineSemaphore` to quickly scan and
+// process signaled values.
+//
+// This is expected to used together with `TimePointSemaphorePool` and
+// `EmulatedTimelineSemaphore` and must be externally synchronized via their
+// mutexes.
+struct TimePointSemaphore final : public IntrusiveLinkBase<void> {
+  // Allocated binary semaphore that represents a time point in the timeline.
+  // This is passed to queue submission.
+  VkSemaphore semaphore = VK_NULL_HANDLE;
+
+  // Value of the timeline should be at when the binary semaphore is signaled.
+  uint64_t value = UINT64_MAX;
+
+  // The fence associated with the queue submission signaling this semaphore.
+  // nullptr means this binary semaphore has not been submitted to GPU.
+  ref_ptr<TimePointFence> signal_fence = nullptr;
+
+  // The fence associated with the queue submission waiting this semaphore.
+  // nullptr means this binary semaphore has not been waited by any queue
+  // submission.
+  ref_ptr<TimePointFence> wait_fence = nullptr;
+};
+
+// A pool of `VkFence`s that can be used by `EmulatedTimelineSemaphore` to track
+// timeline progress on CPU. Each `VkFence` can be used to query the status of
+// all the semaphores in the same submission to a `VkQueue`.
+class TimePointFencePool final : public RefObject<TimePointFencePool> {
+ public:
+  static constexpr int kMaxInFlightFenceCount = 32;
+
+  // Creates a new pool and pre-allocates `kMaxInFlightFenceCount` fences.
+  static StatusOr<ref_ptr<TimePointFencePool>> Create(
+      ref_ptr<VkDeviceHandle> logical_device);
+
+  ~TimePointFencePool();
+
+  // Acquires a fence from the pool for use by the caller. The fence is
+  // guaranteed to be in unsignaled state and not in-flight on GPU.
+  //
+  // Returns RESOURCE_EXHAUSTED if the pool has no more available fences.
+  // Callers are expected to handle this by waiting on previous fences or for
+  // complete device idle. Yes, that's as bad as it sounds, and if we start
+  // seeing that we should bump up the max count.
+  StatusOr<ref_ptr<TimePointFence>> Acquire();
+
+  // Releases one fence back to the pool. The fence must either be signaled or
+  // not be in flight on GPU.
+  void ReleaseResolved(TimePointFence* fence);
+
+  const ref_ptr<VkDeviceHandle>& logical_device() const {
+    return logical_device_;
+  }
+
+ private:
+  explicit TimePointFencePool(ref_ptr<VkDeviceHandle> logical_device);
+
+  const ref_ptr<DynamicSymbols>& syms() const;
+
+  Status PreallocateFences() ABSL_LOCKS_EXCLUDED(mutex_);
+
+  ref_ptr<VkDeviceHandle> logical_device_;
+
+  absl::Mutex mutex_;
+
+  IntrusiveList<TimePointFence> free_fences_ ABSL_GUARDED_BY(mutex_);
+};
+
+// A pool of `VkSemaphore`s that can be used by `EmulatedTimelineSemaphore` to
+// simulate individual timeline value signaling.
+class TimePointSemaphorePool final : public RefObject<TimePointSemaphorePool> {
+ public:
+  static constexpr int kMaxInFlightSemaphoreCount = 64;
+
+  // Creates a new pool and pre-allocates `kMaxInFlightSemaphoreCount` binary
+  // semaphores.
+  static StatusOr<ref_ptr<TimePointSemaphorePool>> Create(
+      ref_ptr<VkDeviceHandle> logical_device);
+
+  ~TimePointSemaphorePool();
+
+  // Acquires a binary semaphore from the pool for use by the caller. The
+  // semaphore is guaranteed to be in unsignaled state and not in-flight on GPU.
+  //
+  // Returns RESOURCE_EXHAUSTED if the pool has no more available semaphores.
+  // Callers are expected to handle this by waiting on previous fences or for
+  // complete device idle. Yes, that's as bad as it sounds, and if we start
+  // seeing that we should bump up the max count.
+  StatusOr<TimePointSemaphore*> Acquire();
+
+  // Releases one or more semaphores back to the pool. The binary semaphore must
+  // be unsignaled and not in flight on GPU.
+  void ReleaseResolved(IntrusiveList<TimePointSemaphore>* semaphores);
+
+  // Releases one or more semaphores back to the pool. These may be in any state
+  // and will be assumed as untouchable; the pool will unconditionally recycle
+  // them.
+  void ReleaseUnresolved(IntrusiveList<TimePointSemaphore>* semaphores);
+
+ private:
+  explicit TimePointSemaphorePool(ref_ptr<VkDeviceHandle> logical_device);
+
+  const ref_ptr<DynamicSymbols>& syms() const;
+
+  Status PreallocateSemaphores() ABSL_LOCKS_EXCLUDED(mutex_);
+
+  ref_ptr<VkDeviceHandle> logical_device_;
+
+  absl::Mutex mutex_;
+
+  std::array<TimePointSemaphore, kMaxInFlightSemaphoreCount> storage_
+      ABSL_GUARDED_BY(mutex_);
+  IntrusiveList<TimePointSemaphore> free_semaphores_ ABSL_GUARDED_BY(mutex_);
+};
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_VULKAN_TIMEPOINT_UTIL_H_

diff --git a/iree/hal/vulkan/vulkan_device.cc b/iree/hal/vulkan/vulkan_device.cc
index 3c7e37b..2f2d16e 100644
--- a/iree/hal/vulkan/vulkan_device.cc
+++ b/iree/hal/vulkan/vulkan_device.cc

@@ -30,12 +30,14 @@
 #include "iree/hal/vulkan/direct_command_buffer.h"
 #include "iree/hal/vulkan/direct_command_queue.h"
 #include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/emulated_timeline_semaphore.h"
 #include "iree/hal/vulkan/extensibility_util.h"
 #include "iree/hal/vulkan/native_descriptor_set.h"
 #include "iree/hal/vulkan/native_event.h"
 #include "iree/hal/vulkan/native_timeline_semaphore.h"
 #include "iree/hal/vulkan/pipeline_cache.h"
 #include "iree/hal/vulkan/pipeline_executable_layout.h"
+#include "iree/hal/vulkan/serializing_command_queue.h"
 #include "iree/hal/vulkan/status_util.h"
 #include "iree/hal/vulkan/vma_allocator.h"
 
@@ -164,6 +166,7 @@
     const DeviceInfo& device_info,
     const ref_ptr<VkDeviceHandle>& logical_device,
     const QueueSet& compute_queue_set, const QueueSet& transfer_queue_set,
+    const ref_ptr<TimePointFencePool>& fence_pool,
     const ref_ptr<DynamicSymbols>& syms) {
   absl::InlinedVector<std::unique_ptr<CommandQueue>, 4> command_queues;
 
@@ -175,10 +178,17 @@
     syms->vkGetDeviceQueue(*logical_device,
                            compute_queue_set.queue_family_index, i, &queue);
     std::string queue_name = absl::StrCat(device_info.name(), ":d", i);
+#if IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+    command_queues.push_back(absl::make_unique<SerializingCommandQueue>(
+        std::move(queue_name),
+        CommandCategory::kDispatch | CommandCategory::kTransfer, logical_device,
+        fence_pool, queue));
+#else
     command_queues.push_back(absl::make_unique<DirectCommandQueue>(
         std::move(queue_name),
         CommandCategory::kDispatch | CommandCategory::kTransfer, logical_device,
         queue));
+#endif  // IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
   }
 
   uint64_t transfer_queue_count = CountOnes64(transfer_queue_set.queue_indices);
@@ -189,9 +199,15 @@
     syms->vkGetDeviceQueue(*logical_device,
                            transfer_queue_set.queue_family_index, i, &queue);
     std::string queue_name = absl::StrCat(device_info.name(), ":t", i);
+#if IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+    command_queues.push_back(absl::make_unique<SerializingCommandQueue>(
+        std::move(queue_name), CommandCategory::kTransfer, logical_device,
+        fence_pool, queue));
+#else
     command_queues.push_back(absl::make_unique<DirectCommandQueue>(
         std::move(queue_name), CommandCategory::kTransfer, logical_device,
         queue));
+#endif  // IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
   }
 
   return command_queues;
@@ -354,14 +370,27 @@
   for (uint32_t i = 0; i < queue_family_info.transfer_queue_count; ++i) {
     transfer_queue_set.queue_indices |= 1 << (i + base_queue_index);
   }
-  auto command_queues = CreateCommandQueues(
-      device_info, logical_device, compute_queue_set, transfer_queue_set, syms);
+
+#if IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+  ASSIGN_OR_RETURN(auto semaphore_pool,
+                   TimePointSemaphorePool::Create(add_ref(logical_device)));
+  ASSIGN_OR_RETURN(auto fence_pool,
+                   TimePointFencePool::Create(add_ref(logical_device)));
+#else
+  ref_ptr<TimePointSemaphorePool> semaphore_pool = nullptr;
+  ref_ptr<TimePointFencePool> fence_pool = nullptr;
+#endif  // IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+
+  auto command_queues =
+      CreateCommandQueues(device_info, logical_device, compute_queue_set,
+                          transfer_queue_set, fence_pool, syms);
 
   return assign_ref(new VulkanDevice(
       std::move(driver), device_info, physical_device,
       std::move(logical_device), std::move(allocator),
       std::move(command_queues), std::move(dispatch_command_pool),
-      std::move(transfer_command_pool), debug_capture_manager));
+      std::move(transfer_command_pool), std::move(semaphore_pool),
+      std::move(fence_pool), debug_capture_manager));
 }
 
 // static
@@ -421,13 +450,25 @@
                          device_handle, transfer_queue_set.queue_family_index));
   }
 
-  auto command_queues = CreateCommandQueues(
-      device_info, device_handle, compute_queue_set, transfer_queue_set, syms);
+#if IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+  ASSIGN_OR_RETURN(auto semaphore_pool,
+                   TimePointSemaphorePool::Create(add_ref(device_handle)));
+  ASSIGN_OR_RETURN(auto fence_pool,
+                   TimePointFencePool::Create(add_ref(device_handle)));
+#else
+  ref_ptr<TimePointSemaphorePool> semaphore_pool = nullptr;
+  ref_ptr<TimePointFencePool> fence_pool = nullptr;
+#endif  // IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+
+  auto command_queues =
+      CreateCommandQueues(device_info, device_handle, compute_queue_set,
+                          transfer_queue_set, fence_pool, syms);
 
   return assign_ref(new VulkanDevice(
       std::move(driver), device_info, physical_device, std::move(device_handle),
       std::move(allocator), std::move(command_queues),
       std::move(dispatch_command_pool), std::move(transfer_command_pool),
+      std::move(semaphore_pool), std::move(fence_pool),
       /*debug_capture_manager=*/nullptr));
 }
 
@@ -438,6 +479,8 @@
     absl::InlinedVector<std::unique_ptr<CommandQueue>, 4> command_queues,
     ref_ptr<VkCommandPoolHandle> dispatch_command_pool,
     ref_ptr<VkCommandPoolHandle> transfer_command_pool,
+    ref_ptr<TimePointSemaphorePool> semaphore_pool,
+    ref_ptr<TimePointFencePool> fence_pool,
     DebugCaptureManager* debug_capture_manager)
     : Device(device_info),
       driver_(std::move(driver)),
@@ -449,6 +492,8 @@
           make_ref<DescriptorPoolCache>(add_ref(logical_device_))),
       dispatch_command_pool_(std::move(dispatch_command_pool)),
       transfer_command_pool_(std::move(transfer_command_pool)),
+      semaphore_pool_(std::move(semaphore_pool)),
+      fence_pool_(std::move(fence_pool)),
       debug_capture_manager_(debug_capture_manager) {
   // Populate the queue lists based on queue capabilities.
   for (auto& command_queue : command_queues_) {
@@ -650,8 +695,36 @@
 StatusOr<ref_ptr<Semaphore>> VulkanDevice::CreateSemaphore(
     uint64_t initial_value) {
   IREE_TRACE_SCOPE0("VulkanDevice::CreateSemaphore");
+#if IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+  return EmulatedTimelineSemaphore::Create(
+      add_ref(logical_device_),
+      // Triggers necessary processing on all queues due to new values gotten
+      // signaled for the given timeline |semaphore|.
+      // Different clang-format versions disagree about the asterisk placement.
+      // clang-format off
+      [this](Semaphore* /*semaphore*/) -> Status {
+        // clang-format on
+        IREE_TRACE_SCOPE0("<lambda>::OnSemaphoreSignal");
+        for (const auto& queue : command_queues_) {
+          RETURN_IF_ERROR(static_cast<SerializingCommandQueue*>(queue.get())
+                              ->AdvanceQueueSubmission());
+        }
+        return OkStatus();
+      },
+      // Triggers necessary processing on all queues due to failures for the
+      // given timeline |semaphore|.
+      [this](Semaphore* /*semaphore*/) {
+        IREE_TRACE_SCOPE0("<lambda>::OnSemaphoreFailure");
+        for (const auto& queue : command_queues_) {
+          static_cast<SerializingCommandQueue*>(queue.get())
+              ->AbortQueueSubmission();
+        }
+      },
+      add_ref(semaphore_pool_), initial_value);
+#else
   return NativeTimelineSemaphore::Create(add_ref(logical_device_),
                                          initial_value);
+#endif  // IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
 }
 
 Status VulkanDevice::WaitAllSemaphores(
@@ -672,6 +745,23 @@
                                     VkSemaphoreWaitFlags wait_flags) {
   IREE_TRACE_SCOPE0("VulkanDevice::WaitSemaphores");
 
+#if IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+
+  // TODO(antiagainst): We actually should get the fences associated with the
+  // emulated timeline semaphores so that we can wait them in a bunch. This
+  // implementation is problematic if we wait to wait any and we have the first
+  // semaphore taking extra long time but the following ones signal quickly.
+  for (int i = 0; i < semaphores.size(); ++i) {
+    auto* semaphore =
+        static_cast<EmulatedTimelineSemaphore*>(semaphores[i].semaphore);
+    RETURN_IF_ERROR(semaphore->Wait(semaphores[i].value, deadline));
+    if (wait_flags & VK_SEMAPHORE_WAIT_ANY_BIT) return OkStatus();
+  }
+
+  return OkStatus();
+
+#else
+
   absl::InlinedVector<VkSemaphore, 4> semaphore_handles(semaphores.size());
   absl::InlinedVector<uint64_t, 4> semaphore_values(semaphores.size());
   for (int i = 0; i < semaphores.size(); ++i) {
@@ -714,6 +804,8 @@
   // semaphores we waited on (including those already expired above).
 
   return OkStatus();
+
+#endif  // IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
 }
 
 Status VulkanDevice::WaitIdle(absl::Time deadline) {

diff --git a/iree/hal/vulkan/vulkan_device.h b/iree/hal/vulkan/vulkan_device.h
index cfceb4f..ce7c9d7 100644
--- a/iree/hal/vulkan/vulkan_device.h
+++ b/iree/hal/vulkan/vulkan_device.h

@@ -30,6 +30,7 @@
 #include "iree/hal/semaphore.h"
 #include "iree/hal/vulkan/descriptor_pool_cache.h"
 #include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/emulated_timeline_semaphore.h"
 #include "iree/hal/vulkan/extensibility_util.h"
 #include "iree/hal/vulkan/handle_util.h"
 
@@ -119,6 +120,8 @@
       absl::InlinedVector<std::unique_ptr<CommandQueue>, 4> command_queues,
       ref_ptr<VkCommandPoolHandle> dispatch_command_pool,
       ref_ptr<VkCommandPoolHandle> transfer_command_pool,
+      ref_ptr<TimePointSemaphorePool> semaphore_pool,
+      ref_ptr<TimePointFencePool> fence_pool,
       DebugCaptureManager* debug_capture_manager);
 
   Status WaitSemaphores(absl::Span<const SemaphoreValue> semaphores,
@@ -139,6 +142,10 @@
   ref_ptr<VkCommandPoolHandle> dispatch_command_pool_;
   ref_ptr<VkCommandPoolHandle> transfer_command_pool_;
 
+  // Fields used for emulated timeline semaphores.
+  ref_ptr<TimePointSemaphorePool> semaphore_pool_;
+  ref_ptr<TimePointFencePool> fence_pool_;
+
   DebugCaptureManager* debug_capture_manager_ = nullptr;
 };
 

diff --git a/iree/hal/vulkan/vulkan_driver_module.cc b/iree/hal/vulkan/vulkan_driver_module.cc
index 4b98f58..f034127 100644
--- a/iree/hal/vulkan/vulkan_driver_module.cc
+++ b/iree/hal/vulkan/vulkan_driver_module.cc

@@ -67,9 +67,11 @@
   // promoted to core, so we list it as optional even though we require it.
   options.instance_extensibility.optional_extensions.push_back(
       VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME);
+#if IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES == 0
   // Timeline semaphore support is required.
   options.device_extensibility.required_extensions.push_back(
       VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME);
+#endif
 
   if (absl::GetFlag(FLAGS_vulkan_validation_layers)) {
     options.instance_extensibility.optional_layers.push_back(

diff --git a/iree/test/e2e/xla_ops/BUILD b/iree/test/e2e/xla_ops/BUILD
index c8ea2fb..6f95110 100644
--- a/iree/test/e2e/xla_ops/BUILD
+++ b/iree/test/e2e/xla_ops/BUILD

@@ -48,11 +48,6 @@
         "convert.mlir",
         "concatenate.mlir",
         "constant.mlir",
-
-        # TODO(#1687): Enable after casting from fp to int is handled
-        # on structured ops path in vulkan
-        # "convert.mlir",
-        #
         "cosine.mlir",
         "divide.mlir",
         "dot.mlir",
@@ -76,7 +71,8 @@
         "reduce_window.mlir",
         "remainder.mlir",
         "reshape.mlir",
-        "reverse.mlir",
+        # TODO(#1699): Enable after xla_hlo.reverse can be lowered to linalg.
+        # "reverse.mlir",
         "rsqrt.mlir",
         "select.mlir",
         "sine.mlir",

diff --git a/iree/test/e2e/xla_ops/CMakeLists.txt b/iree/test/e2e/xla_ops/CMakeLists.txt
index 7f65b06..b2eec93 100644
--- a/iree/test/e2e/xla_ops/CMakeLists.txt
+++ b/iree/test/e2e/xla_ops/CMakeLists.txt

@@ -56,7 +56,6 @@
     "reduce_window.mlir"
     "remainder.mlir"
     "reshape.mlir"
-    "reverse.mlir"
     "rsqrt.mlir"
     "select.mlir"
     "sine.mlir"

diff --git a/iree/tools/CMakeLists.txt b/iree/tools/CMakeLists.txt
index fce935b..5267dec 100644
--- a/iree/tools/CMakeLists.txt
+++ b/iree/tools/CMakeLists.txt

@@ -101,6 +101,7 @@
       iree::compiler::Dialect::VM::Tools
     LINKOPTS
       "-lpthread"
+    HOSTONLY
   )
 endif()
 
@@ -253,6 +254,31 @@
     PUBLIC
   )
 
+  iree_cc_library(
+    NAME
+      iree_translate_main
+    SRCS
+      "translate_main.cc"
+    DEPS
+      ::init_compiler_modules
+      ::init_iree_passes_and_dialects
+      ::init_mlir_passes_and_dialects
+      ::init_targets
+      ::init_translations
+      ::init_xla_dialects
+      LLVMSupport
+      MLIRIR
+      MLIRSCFTransforms
+      MLIRPass
+      MLIRSupport
+      MLIRTranslation
+      iree::compiler::Conversion::init_conversions
+      iree::compiler::Dialect::VM::Target::Bytecode
+      iree::compiler::Dialect::VM::Target::init_targets
+      iree::compiler::Translation::IREEVM
+    PUBLIC
+  )
+
   iree_cc_binary(
     NAME
       iree-opt
@@ -260,6 +286,7 @@
       iree-opt
     DEPS
       ::iree_opt_main
+    HOSTONLY
   )
 
   iree_cc_binary(
@@ -303,33 +330,14 @@
       iree::vm::bytecode_module
       iree::vm::value
       ${IREE_HAL_DRIVER_MODULES}
+    HOSTONLY
   )
+endif(${IREE_BUILD_COMPILER})
 
-  iree_cc_library(
-    NAME
-      iree_translate_main
-    SRCS
-      "translate_main.cc"
-    DEPS
-      ::init_compiler_modules
-      ::init_iree_passes_and_dialects
-      ::init_mlir_passes_and_dialects
-      ::init_targets
-      ::init_translations
-      ::init_xla_dialects
-      LLVMSupport
-      MLIRIR
-      MLIRSCFTransforms
-      MLIRPass
-      MLIRSupport
-      MLIRTranslation
-      iree::compiler::Conversion::init_conversions
-      iree::compiler::Dialect::VM::Target::Bytecode
-      iree::compiler::Dialect::VM::Target::init_targets
-      iree::compiler::Translation::IREEVM
-    PUBLIC
-  )
-
+# If cross-compiling, we need to declare iree-translate under host configuration
+# unconditionally because we need to run it on host to generate VM modules
+# for tests.
+if(${IREE_BUILD_COMPILER} OR CMAKE_CROSSCOMPILING)
   iree_cc_binary(
     NAME
       iree-translate
@@ -337,6 +345,7 @@
       iree-translate
     DEPS
       ::iree_translate_main
+    HOSTONLY
   )
 endif()
 

diff --git a/iree/vm/test/BUILD b/iree/vm/test/BUILD
index 28312bc..fb9848d 100644
--- a/iree/vm/test/BUILD
+++ b/iree/vm/test/BUILD

@@ -35,13 +35,11 @@
 iree_bytecode_module(
     name = "arithmetic_ops",
     src = "arithmetic_ops.mlir",
-    cc_namespace = "iree::vm::test",
     flags = ["-iree-vm-ir-to-bytecode-module"],
 )
 
 iree_bytecode_module(
     name = "control_flow_ops",
     src = "control_flow_ops.mlir",
-    cc_namespace = "iree::vm::test",
     flags = ["-iree-vm-ir-to-bytecode-module"],
 )

diff --git a/iree/vm/test/CMakeLists.txt b/iree/vm/test/CMakeLists.txt
index 7f9a4ae..6c189b6 100644
--- a/iree/vm/test/CMakeLists.txt
+++ b/iree/vm/test/CMakeLists.txt

@@ -35,8 +35,6 @@
     arithmetic_ops
   SRC
     "arithmetic_ops.mlir"
-  CC_NAMESPACE
-    "iree::vm::test"
   FLAGS
     "-iree-vm-ir-to-bytecode-module"
   PUBLIC
@@ -47,8 +45,6 @@
     control_flow_ops
   SRC
     "control_flow_ops.mlir"
-  CC_NAMESPACE
-    "iree::vm::test"
   FLAGS
     "-iree-vm-ir-to-bytecode-module"
   PUBLIC

diff --git a/kokoro/gcp_ubuntu/bazel/bindings/continuous.cfg b/kokoro/gcp_ubuntu/bazel/bindings/continuous.cfg
index af84216..50a7eed 100644
--- a/kokoro/gcp_ubuntu/bazel/bindings/continuous.cfg
+++ b/kokoro/gcp_ubuntu/bazel/bindings/continuous.cfg

@@ -15,5 +15,5 @@
 # limitations under the License.
 
 # Deliberately blank as everything necessary is configured in common files, but
-# file must still exist to match corresponding (upstream only) job
+# file must still exist to match corresponding (Google internal) job
 # configurations that trigger the builds.

diff --git a/kokoro/gcp_ubuntu/bazel/bindings/google.cfg b/kokoro/gcp_ubuntu/bazel/bindings/google.cfg
new file mode 100644
index 0000000..50a7eed
--- /dev/null
+++ b/kokoro/gcp_ubuntu/bazel/bindings/google.cfg

@@ -0,0 +1,19 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Deliberately blank as everything necessary is configured in common files, but
+# file must still exist to match corresponding (Google internal) job
+# configurations that trigger the builds.

diff --git a/kokoro/gcp_ubuntu/bazel/bindings/main.cfg b/kokoro/gcp_ubuntu/bazel/bindings/main.cfg
new file mode 100644
index 0000000..50a7eed
--- /dev/null
+++ b/kokoro/gcp_ubuntu/bazel/bindings/main.cfg

@@ -0,0 +1,19 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Deliberately blank as everything necessary is configured in common files, but
+# file must still exist to match corresponding (Google internal) job
+# configurations that trigger the builds.

diff --git a/kokoro/gcp_ubuntu/bazel/bindings/presubmit.cfg b/kokoro/gcp_ubuntu/bazel/bindings/presubmit.cfg
index af84216..50a7eed 100644
--- a/kokoro/gcp_ubuntu/bazel/bindings/presubmit.cfg
+++ b/kokoro/gcp_ubuntu/bazel/bindings/presubmit.cfg

@@ -15,5 +15,5 @@
 # limitations under the License.
 
 # Deliberately blank as everything necessary is configured in common files, but
-# file must still exist to match corresponding (upstream only) job
+# file must still exist to match corresponding (Google internal) job
 # configurations that trigger the builds.

diff --git a/kokoro/gcp_ubuntu/bazel/core/continuous.cfg b/kokoro/gcp_ubuntu/bazel/core/continuous.cfg
index af84216..50a7eed 100755
--- a/kokoro/gcp_ubuntu/bazel/core/continuous.cfg
+++ b/kokoro/gcp_ubuntu/bazel/core/continuous.cfg

@@ -15,5 +15,5 @@
 # limitations under the License.
 
 # Deliberately blank as everything necessary is configured in common files, but
-# file must still exist to match corresponding (upstream only) job
+# file must still exist to match corresponding (Google internal) job
 # configurations that trigger the builds.

diff --git a/kokoro/gcp_ubuntu/bazel/core/google.cfg b/kokoro/gcp_ubuntu/bazel/core/google.cfg
new file mode 100755
index 0000000..50a7eed
--- /dev/null
+++ b/kokoro/gcp_ubuntu/bazel/core/google.cfg

@@ -0,0 +1,19 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Deliberately blank as everything necessary is configured in common files, but
+# file must still exist to match corresponding (Google internal) job
+# configurations that trigger the builds.

diff --git a/kokoro/gcp_ubuntu/bazel/core/main.cfg b/kokoro/gcp_ubuntu/bazel/core/main.cfg
new file mode 100755
index 0000000..50a7eed
--- /dev/null
+++ b/kokoro/gcp_ubuntu/bazel/core/main.cfg

@@ -0,0 +1,19 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Deliberately blank as everything necessary is configured in common files, but
+# file must still exist to match corresponding (Google internal) job
+# configurations that trigger the builds.

diff --git a/kokoro/gcp_ubuntu/bazel/core/presubmit.cfg b/kokoro/gcp_ubuntu/bazel/core/presubmit.cfg
index af84216..50a7eed 100755
--- a/kokoro/gcp_ubuntu/bazel/core/presubmit.cfg
+++ b/kokoro/gcp_ubuntu/bazel/core/presubmit.cfg

@@ -15,5 +15,5 @@
 # limitations under the License.
 
 # Deliberately blank as everything necessary is configured in common files, but
-# file must still exist to match corresponding (upstream only) job
+# file must still exist to match corresponding (Google internal) job
 # configurations that trigger the builds.

diff --git a/kokoro/gcp_ubuntu/bazel/integrations/continuous.cfg b/kokoro/gcp_ubuntu/bazel/integrations/continuous.cfg
index af84216..50a7eed 100644
--- a/kokoro/gcp_ubuntu/bazel/integrations/continuous.cfg
+++ b/kokoro/gcp_ubuntu/bazel/integrations/continuous.cfg

@@ -15,5 +15,5 @@
 # limitations under the License.
 
 # Deliberately blank as everything necessary is configured in common files, but
-# file must still exist to match corresponding (upstream only) job
+# file must still exist to match corresponding (Google internal) job
 # configurations that trigger the builds.

diff --git a/kokoro/gcp_ubuntu/bazel/integrations/google.cfg b/kokoro/gcp_ubuntu/bazel/integrations/google.cfg
new file mode 100644
index 0000000..50a7eed
--- /dev/null
+++ b/kokoro/gcp_ubuntu/bazel/integrations/google.cfg

@@ -0,0 +1,19 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Deliberately blank as everything necessary is configured in common files, but
+# file must still exist to match corresponding (Google internal) job
+# configurations that trigger the builds.

diff --git a/kokoro/gcp_ubuntu/bazel/integrations/main.cfg b/kokoro/gcp_ubuntu/bazel/integrations/main.cfg
new file mode 100644
index 0000000..50a7eed
--- /dev/null
+++ b/kokoro/gcp_ubuntu/bazel/integrations/main.cfg

@@ -0,0 +1,19 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Deliberately blank as everything necessary is configured in common files, but
+# file must still exist to match corresponding (Google internal) job
+# configurations that trigger the builds.

diff --git a/kokoro/gcp_ubuntu/bazel/integrations/presubmit.cfg b/kokoro/gcp_ubuntu/bazel/integrations/presubmit.cfg
index af84216..50a7eed 100644
--- a/kokoro/gcp_ubuntu/bazel/integrations/presubmit.cfg
+++ b/kokoro/gcp_ubuntu/bazel/integrations/presubmit.cfg

@@ -15,5 +15,5 @@
 # limitations under the License.
 
 # Deliberately blank as everything necessary is configured in common files, but
-# file must still exist to match corresponding (upstream only) job
+# file must still exist to match corresponding (Google internal) job
 # configurations that trigger the builds.

diff --git a/kokoro/gcp_ubuntu/cmake/continuous.cfg b/kokoro/gcp_ubuntu/cmake/continuous.cfg
index d825b90..e4cc270 100644
--- a/kokoro/gcp_ubuntu/cmake/continuous.cfg
+++ b/kokoro/gcp_ubuntu/cmake/continuous.cfg

@@ -15,5 +15,5 @@
 # limitations under the License.
 
 # Deliberately blank as everything necessary is configured in common files, but
-# file must still exist to match corresponding (upstream only) job
+# file must still exist to match corresponding (Google internal) job
 # configurations that trigger the builds.

diff --git a/kokoro/gcp_ubuntu/cmake/google.cfg b/kokoro/gcp_ubuntu/cmake/google.cfg
new file mode 100644
index 0000000..e4cc270
--- /dev/null
+++ b/kokoro/gcp_ubuntu/cmake/google.cfg

@@ -0,0 +1,19 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Deliberately blank as everything necessary is configured in common files, but
+# file must still exist to match corresponding (Google internal) job
+# configurations that trigger the builds.

diff --git a/kokoro/gcp_ubuntu/cmake/main.cfg b/kokoro/gcp_ubuntu/cmake/main.cfg
new file mode 100644
index 0000000..e4cc270
--- /dev/null
+++ b/kokoro/gcp_ubuntu/cmake/main.cfg

@@ -0,0 +1,19 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Deliberately blank as everything necessary is configured in common files, but
+# file must still exist to match corresponding (Google internal) job
+# configurations that trigger the builds.

diff --git a/kokoro/gcp_ubuntu/cmake/presubmit.cfg b/kokoro/gcp_ubuntu/cmake/presubmit.cfg
index d825b90..e4cc270 100644
--- a/kokoro/gcp_ubuntu/cmake/presubmit.cfg
+++ b/kokoro/gcp_ubuntu/cmake/presubmit.cfg

@@ -15,5 +15,5 @@
 # limitations under the License.
 
 # Deliberately blank as everything necessary is configured in common files, but
-# file must still exist to match corresponding (upstream only) job
+# file must still exist to match corresponding (Google internal) job
 # configurations that trigger the builds.

diff --git a/third_party/llvm-project b/third_party/llvm-project
index 7e825ab..9fb7e98 160000
--- a/third_party/llvm-project
+++ b/third_party/llvm-project

@@ -1 +1 @@
-Subproject commit 7e825abd5704ce28b166f9463d4bd304348fd2a9
+Subproject commit 9fb7e98db5aaef617878a127b663efa4d01aa834

diff --git a/third_party/tensorflow b/third_party/tensorflow
index b00a780..f74654a 160000
--- a/third_party/tensorflow
+++ b/third_party/tensorflow

@@ -1 +1 @@
-Subproject commit b00a7808a7b29a78762b54e29aac87a77254b4b6
+Subproject commit f74654ac7b314a212b1df6687c2f99800084e97f